feat: add KOReader compatible hash to file metadata

Implement KOReader's partial MD5 algorithm for document identification. This hash allows KOReader devices to match local files with server records for reading progress synchronization (KOSync).
This commit is contained in:
2026-03-09 13:38:07 -04:00
parent 930dbe9ba4
commit 20a69de968
2 changed files with 130 additions and 5 deletions

View File

@@ -44,11 +44,13 @@ from chitai.schemas.book import BooksCreateFromFiles
from chitai.services.filesystem_library import BookPathGenerator
from chitai.services.metadata_extractor import Extractor as MetadataExtractor
from chitai.services.utils import (
calculate_koreader_hash,
cleanup_empty_parent_directories,
delete_file,
move_dir_contents,
move_file,
save_image,
StreamingHasher,
)
@@ -172,18 +174,18 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
file_metadata = []
for file in files:
stats = await aios.stat(file)
file_size = stats.st_size
content_type, _ = mimetypes.guess_type(file)
file_hash = await calculate_koreader_hash(file)
filename = path_gen.generate_filename(data, Path(file.name))
file_metadata.append(
FileMetadata(
path=str(filename),
size=file_size,
hash="stub-hash", # TODO: implement file hashing to catch duplicates
hash=file_hash,
content_type=content_type,
)
)
@@ -540,10 +542,13 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
await file.seek(0)
path = parent / filename
path.parent.mkdir(parents=True, exist_ok=True)
hasher = StreamingHasher()
async with aiofiles.open(path, "wb") as dest:
# Read spooled file and save it to the local filesystem
while chunk := await file.read(CHUNK_SIZE):
await dest.write(chunk)
hasher.update(chunk)
stats = await aios.stat(path)
file_size = stats.st_size
@@ -552,7 +557,7 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
FileMetadata(
path=str(filename),
size=file_size,
hash="stub-hash", # TODO: implement file hashing to catch duplicates
hash=hasher.hexdigest(),
content_type=file.content_type,
)
)

View File

@@ -1,9 +1,15 @@
# src/chitai/services/utils.py
# Standard library
from __future__ import annotations
import hashlib
from pathlib import Path
import shutil
from typing import BinaryIO
from typing import TYPE_CHECKING, BinaryIO
if TYPE_CHECKING:
from hashlib import _Hash
# Third-party libraries
import PIL
@@ -12,6 +18,120 @@ import aiofiles
import aiofiles.os as aios
from litestar.datastructures import UploadFile
##################################
# KOReader file hash utilities #
##################################
# KOReader partial MD5 constants.
# These match KOReader's partial MD5 implementation for document identification.
# KOReader samples KO_SAMPLE_SIZE bytes at offsets KO_STEP << (2*i), computed
# with LuaJIT's 32-bit left shift. LuaJIT masks the shift amount to 5 bits
# (shift & 0x1F), so i = -1 wraps and produces offset 0.
# Resulting offsets: 0, 1024, 4096, 16384, 65536, 262144, 1048576, ...
KO_STEP = 1024  # base value shifted to produce each sampling offset
KO_SAMPLE_SIZE = 1024  # number of bytes hashed at each offset
KO_INDICES = range(-1, 11)  # shift indices i, -1 through 10 inclusive
def _lshift32(val: int, shift: int) -> int:
"""
32-bit left shift matching LuaJIT's bit.lshift behavior.
LuaJIT masks the shift amount to 5 bits (0-31) and performs 32-bit arithmetic.
This causes negative shifts to wrap: shift=-2 becomes shift=30, and
1024 << 30 overflows 32 bits to produce 0.
"""
val &= 0xFFFFFFFF
shift &= 0x1F
return (val << shift) & 0xFFFFFFFF
def _get_koreader_offsets() -> list[int]:
    """Return every byte offset KOReader samples, in ascending order."""
    return [_lshift32(KO_STEP, index * 2) for index in KO_INDICES]
def _partial_md5_from_chunk(
    chunk: bytes,
    hasher: hashlib._Hash,
    offsets: list[int],
    chunk_start: int,
) -> None:
    """
    Update the partial-MD5 hasher with the sampled bytes present in *chunk*.

    KOReader hashes only KO_SAMPLE_SIZE bytes at each offset in *offsets*
    rather than the whole file. For every sample window that overlaps this
    chunk, the overlapping bytes are fed to the hasher. Because *offsets*
    are ascending and the windows never overlap one another, feeding the
    intersections in offset order preserves the byte order of the full
    sample stream even when a window straddles a chunk boundary.

    Args:
        chunk: The current chunk of file data.
        hasher: The MD5 hasher to update.
        offsets: Ascending list of byte offsets to sample from the file.
        chunk_start: The starting byte position of this chunk in the file.
    """
    chunk_end = chunk_start + len(chunk)
    for offset in offsets:
        # Intersect the sample window [offset, offset + KO_SAMPLE_SIZE) with
        # [chunk_start, chunk_end). The previous version only handled windows
        # that *begin* inside the chunk, so a window starting near the end of
        # one chunk had its tail bytes in the next chunk silently dropped —
        # producing a wrong hash whenever chunks are not aligned to offsets.
        lo = max(offset, chunk_start)
        hi = min(offset + KO_SAMPLE_SIZE, chunk_end)
        if lo < hi:
            hasher.update(chunk[lo - chunk_start : hi - chunk_start])
async def calculate_koreader_hash(file_path: Path) -> str:
    """
    Calculate KOReader-compatible partial MD5 hash for a file.

    KOReader identifies documents by hashing KO_SAMPLE_SIZE bytes at a fixed
    set of offsets — 1024 << (2*i) for i from -1 to 10, where the 32-bit
    shift wrap makes i = -1 produce offset 0 — instead of the whole file:
    0, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, ...

    Seeking directly to each offset reads at most ~12 KiB regardless of file
    size, instead of streaming the entire file through memory. Offsets past
    the end of the file contribute nothing, matching KOReader's behavior, and
    the ascending offset order keeps the hashed byte stream identical to the
    previous chunked implementation.

    Args:
        file_path: Path to the file to hash.

    Returns:
        The hexadecimal MD5 hash string.
    """
    hasher = hashlib.md5()
    async with aiofiles.open(file_path, "rb") as f:
        for offset in _get_koreader_offsets():
            await f.seek(offset)
            sample = await f.read(KO_SAMPLE_SIZE)
            if not sample:
                # Offsets are ascending, so the first empty read means every
                # remaining offset is also past EOF.
                break
            hasher.update(sample)
    return hasher.hexdigest()
class StreamingHasher:
    """
    Incrementally compute the KOReader partial-MD5 hash of a byte stream.

    Feed chunks through update() while writing them elsewhere (e.g. during a
    streamed upload); hexdigest() then yields the hash without having to
    re-read the file from disk afterwards.
    """

    def __init__(self) -> None:
        self.hasher = hashlib.md5()
        self.offsets = _get_koreader_offsets()
        self.position = 0

    def update(self, chunk: bytes) -> None:
        """Feed the next chunk of the stream into the hash."""
        start = self.position
        self.position = start + len(chunk)
        _partial_md5_from_chunk(chunk, self.hasher, self.offsets, start)

    def hexdigest(self) -> str:
        """Return the hex digest of all data seen so far."""
        return self.hasher.hexdigest()
##################################
# Filesystem related utilities #
##################################