feat: add KOReader-compatible hash to file metadata
Implement KOReader's partial MD5 algorithm for document identification. This hash allows KOReader devices to match local files with server records for reading progress synchronization (KOSync).
This commit is contained in:
@@ -44,11 +44,13 @@ from chitai.schemas.book import BooksCreateFromFiles
|
|||||||
from chitai.services.filesystem_library import BookPathGenerator
|
from chitai.services.filesystem_library import BookPathGenerator
|
||||||
from chitai.services.metadata_extractor import Extractor as MetadataExtractor
|
from chitai.services.metadata_extractor import Extractor as MetadataExtractor
|
||||||
from chitai.services.utils import (
|
from chitai.services.utils import (
|
||||||
|
calculate_koreader_hash,
|
||||||
cleanup_empty_parent_directories,
|
cleanup_empty_parent_directories,
|
||||||
delete_file,
|
delete_file,
|
||||||
move_dir_contents,
|
move_dir_contents,
|
||||||
move_file,
|
move_file,
|
||||||
save_image,
|
save_image,
|
||||||
|
StreamingHasher,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -172,18 +174,18 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
file_metadata = []
|
file_metadata = []
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
|
|
||||||
stats = await aios.stat(file)
|
stats = await aios.stat(file)
|
||||||
file_size = stats.st_size
|
file_size = stats.st_size
|
||||||
content_type, _ = mimetypes.guess_type(file)
|
content_type, _ = mimetypes.guess_type(file)
|
||||||
|
file_hash = await calculate_koreader_hash(file)
|
||||||
|
|
||||||
filename = path_gen.generate_filename(data, Path(file.name))
|
filename = path_gen.generate_filename(data, Path(file.name))
|
||||||
|
|
||||||
file_metadata.append(
|
file_metadata.append(
|
||||||
FileMetadata(
|
FileMetadata(
|
||||||
path=str(filename),
|
path=str(filename),
|
||||||
size=file_size,
|
size=file_size,
|
||||||
hash="stub-hash", # TODO: implement file hashing to catch duplicates
|
hash=file_hash,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -540,10 +542,13 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
await file.seek(0)
|
await file.seek(0)
|
||||||
path = parent / filename
|
path = parent / filename
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
hasher = StreamingHasher()
|
||||||
async with aiofiles.open(path, "wb") as dest:
|
async with aiofiles.open(path, "wb") as dest:
|
||||||
# Read spooled file and save it to the local filesystem
|
# Read spooled file and save it to the local filesystem
|
||||||
while chunk := await file.read(CHUNK_SIZE):
|
while chunk := await file.read(CHUNK_SIZE):
|
||||||
await dest.write(chunk)
|
await dest.write(chunk)
|
||||||
|
hasher.update(chunk)
|
||||||
|
|
||||||
stats = await aios.stat(path)
|
stats = await aios.stat(path)
|
||||||
file_size = stats.st_size
|
file_size = stats.st_size
|
||||||
@@ -552,7 +557,7 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
FileMetadata(
|
FileMetadata(
|
||||||
path=str(filename),
|
path=str(filename),
|
||||||
size=file_size,
|
size=file_size,
|
||||||
hash="stub-hash", # TODO: implement file hashing to catch duplicates
|
hash=hasher.hexdigest(),
|
||||||
content_type=file.content_type,
|
content_type=file.content_type,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,9 +1,15 @@
|
|||||||
# src/chitai/services/utils.py
|
# src/chitai/services/utils.py
|
||||||
|
|
||||||
# Standard library
|
# Standard library
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
from typing import BinaryIO
|
from typing import TYPE_CHECKING, BinaryIO
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from hashlib import _Hash
|
||||||
|
|
||||||
# Third-party libraries
|
# Third-party libraries
|
||||||
import PIL
|
import PIL
|
||||||
@@ -12,6 +18,120 @@ import aiofiles
|
|||||||
import aiofiles.os as aios
|
import aiofiles.os as aios
|
||||||
from litestar.datastructures import UploadFile
|
from litestar.datastructures import UploadFile
|
||||||
|
|
||||||
|
|
||||||
|
##################################
# KOReader file hash utilities  #
##################################

# Constants mirroring KOReader's partial-MD5 document fingerprint.
# KOReader hashes 1024-byte samples taken at offsets computed as
# 1024 << (2 * i) for i from -1 to 10, using 32-bit shift arithmetic.
# LuaJIT masks the shift amount to 5 bits, so i = -1 wraps to a shift
# of 30 and the 32-bit overflow yields offset 0.
# Resulting offsets: 0, 1024, 4096, 16384, 65536, 262144, 1048576, ...
KO_STEP = 1024  # base value fed into the shifted-offset formula
KO_SAMPLE_SIZE = 1024  # number of bytes hashed at each sampled offset
KO_INDICES = range(-1, 11)  # shift exponents i: -1 through 10 inclusive
||||||
|
|
||||||
|
def _lshift32(val: int, shift: int) -> int:
|
||||||
|
"""
|
||||||
|
32-bit left shift matching LuaJIT's bit.lshift behavior.
|
||||||
|
|
||||||
|
LuaJIT masks the shift amount to 5 bits (0-31) and performs 32-bit arithmetic.
|
||||||
|
This causes negative shifts to wrap: shift=-2 becomes shift=30, and
|
||||||
|
1024 << 30 overflows 32 bits to produce 0.
|
||||||
|
"""
|
||||||
|
val &= 0xFFFFFFFF
|
||||||
|
shift &= 0x1F
|
||||||
|
return (val << shift) & 0xFFFFFFFF
|
||||||
|
|
||||||
|
|
||||||
|
def _get_koreader_offsets() -> list[int]:
    """Return every byte offset KOReader samples for its partial MD5."""
    return [_lshift32(KO_STEP, 2 * index) for index in KO_INDICES]
|
def _partial_md5_from_chunk(
    chunk: bytes,
    hasher: hashlib._Hash,
    offsets: list[int],
    chunk_start: int,
) -> None:
    """
    Update a partial-MD5 hasher with the sampled bytes found in a chunk.

    KOReader hashes KO_SAMPLE_SIZE bytes starting at each offset in
    *offsets* rather than the entire file.  This helper feeds the hasher
    every byte of those sample windows that overlaps the current chunk,
    so callers may stream the file in chunks of ANY size: a sample that
    straddles a chunk boundary is completed by the following chunk(s).

    Fix: the previous implementation only handled offsets whose window
    *started* inside the chunk and clamped the window to the chunk end,
    silently dropping sample bytes whenever a window crossed a chunk
    boundary (possible with StreamingHasher, whose chunk size is not
    guaranteed to align with the 1024-byte offsets).  That produced a
    hash incompatible with KOReader's.

    Bytes are still fed to the hasher in file order because the offsets
    are ascending and the sample windows do not overlap.

    Args:
        chunk: The current chunk of file data.
        hasher: The MD5 hasher to update.
        offsets: Ascending byte offsets to sample from the file.
        chunk_start: The starting byte position of this chunk in the file.
    """
    chunk_end = chunk_start + len(chunk)
    for offset in offsets:
        # Intersect the sample window [offset, offset + KO_SAMPLE_SIZE)
        # with this chunk's span [chunk_start, chunk_end).
        lo = max(offset, chunk_start)
        hi = min(offset + KO_SAMPLE_SIZE, chunk_end)
        if lo < hi:
            hasher.update(chunk[lo - chunk_start : hi - chunk_start])
|
||||||
|
|
||||||
|
async def calculate_koreader_hash(file_path: Path) -> str:
    """
    Compute the KOReader-compatible partial MD5 digest of a file.

    Instead of hashing the whole file, KOReader feeds MD5 with 1024-byte
    samples taken at offsets 1024 << (2 * i) for i from -1 to 10.  The
    32-bit shift arithmetic makes i = -1 wrap around to offset 0, giving:
    0, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, ...
    Sampling keeps identification of large ebook files fast.

    Args:
        file_path: Path of the file to fingerprint.

    Returns:
        The hex-encoded MD5 digest string.
    """
    read_size = 256 * 1024  # 256 KiB per read; a multiple of the sample size
    digest = hashlib.md5()
    sample_offsets = _get_koreader_offsets()
    position = 0

    async with aiofiles.open(file_path, "rb") as stream:
        while data := await stream.read(read_size):
            _partial_md5_from_chunk(data, digest, sample_offsets, position)
            position += len(data)

    return digest.hexdigest()
||||||
|
|
||||||
|
class StreamingHasher:
    """
    Incrementally compute the KOReader partial-MD5 hash of streamed data.

    Feed each chunk to :meth:`update` as it is written to disk and read
    the final digest with :meth:`hexdigest` — no second pass over the
    file is needed after writing.
    """

    def __init__(self) -> None:
        # Bytes consumed so far; maps incoming chunks to file positions.
        self.position = 0
        # Byte offsets KOReader samples (see _get_koreader_offsets).
        self.offsets = _get_koreader_offsets()
        # Running MD5 state for the partial hash.
        self.hasher = hashlib.md5()

    def update(self, chunk: bytes) -> None:
        """Absorb the next chunk of file data into the hash."""
        start = self.position
        self.position = start + len(chunk)
        _partial_md5_from_chunk(chunk, self.hasher, self.offsets, start)

    def hexdigest(self) -> str:
        """Return the final hex-encoded digest."""
        return self.hasher.hexdigest()
|
||||||
##################################
|
##################################
|
||||||
# Filesystem related utilities #
|
# Filesystem related utilities #
|
||||||
##################################
|
##################################
|
||||||
|
|||||||
Reference in New Issue
Block a user