feat: add KOReader-compatible hash to file metadata
Implement KOReader's partial MD5 algorithm for document identification. This hash allows KOReader devices to match local files with server records for reading progress synchronization (KOSync).
This commit is contained in:
@@ -44,11 +44,13 @@ from chitai.schemas.book import BooksCreateFromFiles
|
|||||||
from chitai.services.filesystem_library import BookPathGenerator
|
from chitai.services.filesystem_library import BookPathGenerator
|
||||||
from chitai.services.metadata_extractor import Extractor as MetadataExtractor
|
from chitai.services.metadata_extractor import Extractor as MetadataExtractor
|
||||||
from chitai.services.utils import (
|
from chitai.services.utils import (
|
||||||
|
calculate_koreader_hash,
|
||||||
cleanup_empty_parent_directories,
|
cleanup_empty_parent_directories,
|
||||||
delete_file,
|
delete_file,
|
||||||
move_dir_contents,
|
move_dir_contents,
|
||||||
move_file,
|
move_file,
|
||||||
save_image,
|
save_image,
|
||||||
|
StreamingHasher,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -172,18 +174,18 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
file_metadata = []
|
file_metadata = []
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
|
|
||||||
stats = await aios.stat(file)
|
stats = await aios.stat(file)
|
||||||
file_size = stats.st_size
|
file_size = stats.st_size
|
||||||
content_type, _ = mimetypes.guess_type(file)
|
content_type, _ = mimetypes.guess_type(file)
|
||||||
|
file_hash = await calculate_koreader_hash(file)
|
||||||
|
|
||||||
filename = path_gen.generate_filename(data, Path(file.name))
|
filename = path_gen.generate_filename(data, Path(file.name))
|
||||||
|
|
||||||
file_metadata.append(
|
file_metadata.append(
|
||||||
FileMetadata(
|
FileMetadata(
|
||||||
path=str(filename),
|
path=str(filename),
|
||||||
size=file_size,
|
size=file_size,
|
||||||
hash="stub-hash", # TODO: implement file hashing to catch duplicates
|
hash=file_hash,
|
||||||
content_type=content_type,
|
content_type=content_type,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -540,10 +542,13 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
await file.seek(0)
|
await file.seek(0)
|
||||||
path = parent / filename
|
path = parent / filename
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
hasher = StreamingHasher()
|
||||||
async with aiofiles.open(path, "wb") as dest:
|
async with aiofiles.open(path, "wb") as dest:
|
||||||
# Read spooled file and save it to the local filesystem
|
# Read spooled file and save it to the local filesystem
|
||||||
while chunk := await file.read(CHUNK_SIZE):
|
while chunk := await file.read(CHUNK_SIZE):
|
||||||
await dest.write(chunk)
|
await dest.write(chunk)
|
||||||
|
hasher.update(chunk)
|
||||||
|
|
||||||
stats = await aios.stat(path)
|
stats = await aios.stat(path)
|
||||||
file_size = stats.st_size
|
file_size = stats.st_size
|
||||||
@@ -552,7 +557,7 @@ class BookService(SQLAlchemyAsyncRepositoryService[Book]):
|
|||||||
FileMetadata(
|
FileMetadata(
|
||||||
path=str(filename),
|
path=str(filename),
|
||||||
size=file_size,
|
size=file_size,
|
||||||
hash="stub-hash", # TODO: implement file hashing to catch duplicates
|
hash=hasher.hexdigest(),
|
||||||
content_type=file.content_type,
|
content_type=file.content_type,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,9 +1,15 @@
|
|||||||
# src/chitai/services/utils.py
|
# src/chitai/services/utils.py
|
||||||
|
|
||||||
# Standard library
|
# Standard library
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import shutil
|
import shutil
|
||||||
from typing import BinaryIO
|
from typing import TYPE_CHECKING, BinaryIO
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from hashlib import _Hash
|
||||||
|
|
||||||
# Third-party libraries
|
# Third-party libraries
|
||||||
import PIL
|
import PIL
|
||||||
@@ -12,6 +18,120 @@ import aiofiles
|
|||||||
import aiofiles.os as aios
|
import aiofiles.os as aios
|
||||||
from litestar.datastructures import UploadFile
|
from litestar.datastructures import UploadFile
|
||||||
|
|
||||||
|
|
||||||
|
##################################
# KOReader file hash utilities  #
##################################

# Constants mirroring KOReader's partial-MD5 document fingerprint.
# KOReader hashes 1024-byte samples taken at offsets computed as
# 1024 << (2 * i) for i from -1 to 10, using 32-bit shift arithmetic.
# LuaJIT masks the shift amount to 5 bits, so i = -1 wraps to a shift
# of 30 and the 32-bit overflow yields offset 0.
# Resulting offsets: 0, 1024, 4096, 16384, 65536, 262144, 1048576, ...
KO_STEP = 1024  # base value fed into the shifted-offset formula
KO_SAMPLE_SIZE = 1024  # number of bytes hashed at each sampled offset
KO_INDICES = range(-1, 11)  # shift exponents i: -1 through 10 inclusive
||||||
|
|
||||||
|
def _lshift32(val: int, shift: int) -> int:
|
||||||
|
"""
|
||||||
|
32-bit left shift matching LuaJIT's bit.lshift behavior.
|
||||||
|
|
||||||
|
LuaJIT masks the shift amount to 5 bits (0-31) and performs 32-bit arithmetic.
|
||||||
|
This causes negative shifts to wrap: shift=-2 becomes shift=30, and
|
||||||
|
1024 << 30 overflows 32 bits to produce 0.
|
||||||
|
"""
|
||||||
|
val &= 0xFFFFFFFF
|
||||||
|
shift &= 0x1F
|
||||||
|
return (val << shift) & 0xFFFFFFFF
|
||||||
|
|
||||||
|
|
||||||
|
def _get_koreader_offsets() -> list[int]:
    """Return every byte offset KOReader samples for its partial MD5."""
    return [_lshift32(KO_STEP, 2 * index) for index in KO_INDICES]
|
def _partial_md5_from_chunk(
    chunk: bytes,
    hasher: hashlib._Hash,
    offsets: list[int],
    chunk_start: int,
) -> None:
    """
    Update a partial-MD5 hasher with the sampled bytes found in a chunk.

    KOReader hashes KO_SAMPLE_SIZE bytes starting at each offset in
    *offsets* rather than the entire file.  This helper feeds the hasher
    every byte of those sample windows that overlaps the current chunk,
    so callers may stream the file in chunks of ANY size: a sample that
    straddles a chunk boundary is completed by the following chunk(s).

    Fix: the previous implementation only handled offsets whose window
    *started* inside the chunk and clamped the window to the chunk end,
    silently dropping sample bytes whenever a window crossed a chunk
    boundary (possible with StreamingHasher, whose chunk size is not
    guaranteed to align with the 1024-byte offsets).  That produced a
    hash incompatible with KOReader's.

    Bytes are still fed to the hasher in file order because the offsets
    are ascending and the sample windows do not overlap.

    Args:
        chunk: The current chunk of file data.
        hasher: The MD5 hasher to update.
        offsets: Ascending byte offsets to sample from the file.
        chunk_start: The starting byte position of this chunk in the file.
    """
    chunk_end = chunk_start + len(chunk)
    for offset in offsets:
        # Intersect the sample window [offset, offset + KO_SAMPLE_SIZE)
        # with this chunk's span [chunk_start, chunk_end).
        lo = max(offset, chunk_start)
        hi = min(offset + KO_SAMPLE_SIZE, chunk_end)
        if lo < hi:
            hasher.update(chunk[lo - chunk_start : hi - chunk_start])
|
||||||
|
|
||||||
|
async def calculate_koreader_hash(file_path: Path) -> str:
    """
    Compute the KOReader-compatible partial MD5 digest of a file.

    Instead of hashing the whole file, KOReader feeds MD5 with 1024-byte
    samples taken at offsets 1024 << (2 * i) for i from -1 to 10.  The
    32-bit shift arithmetic makes i = -1 wrap around to offset 0, giving:
    0, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, ...
    Sampling keeps identification of large ebook files fast.

    Args:
        file_path: Path of the file to fingerprint.

    Returns:
        The hex-encoded MD5 digest string.
    """
    read_size = 256 * 1024  # 256 KiB per read; a multiple of the sample size
    digest = hashlib.md5()
    sample_offsets = _get_koreader_offsets()
    position = 0

    async with aiofiles.open(file_path, "rb") as stream:
        while data := await stream.read(read_size):
            _partial_md5_from_chunk(data, digest, sample_offsets, position)
            position += len(data)

    return digest.hexdigest()
||||||
|
|
||||||
|
class StreamingHasher:
    """
    Incrementally compute the KOReader partial-MD5 hash of streamed data.

    Feed each chunk to :meth:`update` as it is written to disk and read
    the final digest with :meth:`hexdigest` — no second pass over the
    file is needed after writing.
    """

    def __init__(self) -> None:
        # Bytes consumed so far; maps incoming chunks to file positions.
        self.position = 0
        # Byte offsets KOReader samples (see _get_koreader_offsets).
        self.offsets = _get_koreader_offsets()
        # Running MD5 state for the partial hash.
        self.hasher = hashlib.md5()

    def update(self, chunk: bytes) -> None:
        """Absorb the next chunk of file data into the hash."""
        start = self.position
        self.position = start + len(chunk)
        _partial_md5_from_chunk(chunk, self.hasher, self.offsets, start)

    def hexdigest(self) -> str:
        """Return the final hex-encoded digest."""
        return self.hasher.hexdigest()
|
||||||
##################################
|
##################################
|
||||||
# Filesystem related utilities #
|
# Filesystem related utilities #
|
||||||
##################################
|
##################################
|
||||||
|
|||||||
Reference in New Issue
Block a user