diff --git a/backend/src/chitai/services/book.py b/backend/src/chitai/services/book.py index 9fe0f52..d8cde00 100644 --- a/backend/src/chitai/services/book.py +++ b/backend/src/chitai/services/book.py @@ -44,11 +44,13 @@ from chitai.schemas.book import BooksCreateFromFiles from chitai.services.filesystem_library import BookPathGenerator from chitai.services.metadata_extractor import Extractor as MetadataExtractor from chitai.services.utils import ( + calculate_koreader_hash, cleanup_empty_parent_directories, delete_file, move_dir_contents, move_file, save_image, + StreamingHasher, ) @@ -172,18 +174,18 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]): file_metadata = [] for file in files: - stats = await aios.stat(file) file_size = stats.st_size content_type, _ = mimetypes.guess_type(file) + file_hash = await calculate_koreader_hash(file) filename = path_gen.generate_filename(data, Path(file.name)) - + file_metadata.append( FileMetadata( path=str(filename), size=file_size, - hash="stub-hash", # TODO: implement file hashing to catch duplicates + hash=file_hash, content_type=content_type, ) ) @@ -540,10 +542,13 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]): await file.seek(0) path = parent / filename path.parent.mkdir(parents=True, exist_ok=True) + + hasher = StreamingHasher() async with aiofiles.open(path, "wb") as dest: # Read spooled file and save it to the local filesystem while chunk := await file.read(CHUNK_SIZE): await dest.write(chunk) + hasher.update(chunk) stats = await aios.stat(path) file_size = stats.st_size @@ -552,7 +557,7 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]): FileMetadata( path=str(filename), size=file_size, - hash="stub-hash", # TODO: implement file hashing to catch duplicates + hash=hasher.hexdigest(), content_type=file.content_type, ) ) diff --git a/backend/src/chitai/services/utils.py b/backend/src/chitai/services/utils.py index d7486ec..6d714ee 100644 --- a/backend/src/chitai/services/utils.py +++ 
# src/chitai/services/utils.py — KOReader partial-MD5 hash utilities.
# NOTE(review): `aiofiles` is imported at the top of utils.py (per this
# change's import block); it is referenced, not re-imported, here.
import hashlib
from pathlib import Path

##################################
# KOReader file hash utilities   #
##################################

# KOReader identifies a document by a "partial MD5": instead of hashing the
# whole file it hashes 1024-byte samples taken at offsets 1024 << (2*i) for
# i = -1 .. 10, computed with LuaJIT's 32-bit bit.lshift.  LuaJIT masks the
# shift amount to 5 bits, so i = -1 (shift -2 -> 30) overflows 32 bits and
# produces offset 0.
# Offsets: 0, 1024, 4096, 16384, 65536, 262144, 1048576, ...
KO_STEP = 1024
KO_SAMPLE_SIZE = 1024
KO_INDICES = range(-1, 11)  # -1 to 10 inclusive


def _lshift32(val: int, shift: int) -> int:
    """
    32-bit left shift matching LuaJIT's bit.lshift behavior.

    LuaJIT masks the shift amount to 5 bits (0-31) and performs 32-bit
    arithmetic.  This causes negative shifts to wrap: shift=-2 becomes
    shift=30, and 1024 << 30 overflows 32 bits to produce 0.
    """
    val &= 0xFFFFFFFF
    shift &= 0x1F
    return (val << shift) & 0xFFFFFFFF


def _get_koreader_offsets() -> list[int]:
    """Return all KOReader sampling offsets, in ascending file order."""
    return [_lshift32(KO_STEP, 2 * i) for i in KO_INDICES]


def _partial_md5_from_chunk(
    chunk: bytes,
    hasher: "hashlib._Hash",
    offsets: list[int],
    chunk_start: int,
) -> None:
    """
    Update a partial-MD5 hasher with the sampled bytes found in this chunk.

    Each sample window [offset, offset + KO_SAMPLE_SIZE) is intersected with
    the chunk's span [chunk_start, chunk_start + len(chunk)), so a sample
    that straddles a chunk boundary is fed to the hasher piecewise across
    successive calls (the previous implementation dropped the tail of a
    straddling sample, producing wrong hashes for chunk sizes that are not
    multiples of 1024).  Because the offsets are ascending and the sample
    windows are disjoint, sequential chunks feed bytes to the hasher in
    file order, as MD5 requires.

    Args:
        chunk: The current chunk of file data.
        hasher: The MD5 hasher to update.
        offsets: Ascending byte offsets to sample from the file.
        chunk_start: The starting byte position of this chunk in the file.
    """
    chunk_end = chunk_start + len(chunk)
    for offset in offsets:
        # Overlap of the sample window with this chunk, in file coordinates.
        lo = max(offset, chunk_start)
        hi = min(offset + KO_SAMPLE_SIZE, chunk_end)
        if lo < hi:
            hasher.update(chunk[lo - chunk_start : hi - chunk_start])


async def calculate_koreader_hash(file_path: Path) -> str:
    """
    Calculate the KOReader-compatible partial MD5 hash of a file.

    KOReader samples 1024 bytes at offsets 1024 << (2*i) for i from -1 to
    10 (0, 1024, 4096, 16384, ... — see module constants above) rather than
    hashing the entire file.  This implementation seeks directly to each
    offset and reads at most KO_SAMPLE_SIZE bytes, so the total I/O is
    ~12 KiB regardless of file size — the previous version streamed the
    whole file in 256 KiB chunks, which defeats the point of a partial
    hash on large ebooks.

    Args:
        file_path: Path to the file to hash.

    Returns:
        The hexadecimal MD5 hash string.
    """
    hasher = hashlib.md5()
    # aiofiles comes from the module-level imports of utils.py.
    async with aiofiles.open(file_path, "rb") as f:
        for offset in _get_koreader_offsets():
            await f.seek(offset)
            sample = await f.read(KO_SAMPLE_SIZE)
            if not sample:
                # Offsets ascend, so every later read would also be empty.
                break
            hasher.update(sample)
    return hasher.hexdigest()


class StreamingHasher:
    """
    Incrementally compute the KOReader partial MD5 while streaming data.

    Feed the file's bytes to update() in file order — any chunk sizes,
    including sizes that are not multiples of 1024 — and read the result
    from hexdigest().  This allows hash calculation during file writes
    without re-reading the file afterwards.
    """

    def __init__(self) -> None:
        self.hasher = hashlib.md5()
        self.offsets = _get_koreader_offsets()
        self.position = 0  # total bytes consumed so far

    def update(self, chunk: bytes) -> None:
        """Feed the next sequential chunk of file data into the hash."""
        _partial_md5_from_chunk(chunk, self.hasher, self.offsets, self.position)
        self.position += len(chunk)

    def hexdigest(self) -> str:
        """Return the final hash as a hexadecimal string."""
        return self.hasher.hexdigest()


##################################
# Filesystem related utilities   #
##################################