diff --git a/main.py b/main.py index a919f47..92f8956 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,7 @@ import json import re from requests_tor import RequestsTor import requests +import hashlib IMAGE_REMOVE_REGEX = re.compile(r"""https?.*)\".*/>""") @@ -28,9 +29,17 @@ class BookMedia(BaseModel): summary: str +class BranchTeam(BaseModel): + name: str + + class Branch(BaseModel): id: int branchId: int + teams: list[BranchTeam] + + +IMG_HASHES: dict[bytes, str] = {} def process_text(text_obj: dict): @@ -53,6 +62,18 @@ class Chapter(BaseModel): branches: list[Branch] withBranches: bool + def get_translators(self, base_dir: Path) -> list[str]: + if len(self.branches) == 1: + return [team.name for team in self.branches[0].teams] + for branch in self.branches: + branch_zip = ( + base_dir + / f"v{self.volume}-n{self.number}-{self.id}-b{branch.branchId}.zip" + ) + if branch_zip.exists(): + return [team.name for team in branch.teams] + return [] + def get_zip_path(self, base_dir: Path) -> Path | None: nobranch_zip = base_dir / f"v{self.volume}-n{self.number}-{self.id}.zip" if nobranch_zip.exists(): @@ -83,6 +104,7 @@ class Chapter(BaseModel): if zip_path is None: raise FileNotFoundError(f"Chapter for `{self}` not found") zip = zipfile.ZipFile(zip_path) + print(f"Loaded {zip.filename}") data = zip.read("data.txt") try: content = json.loads(data) @@ -99,7 +121,6 @@ class Chapter(BaseModel): output = [] extras = [] - print(f"parsing {zip.filename}") output.append(f"

Глава {self.number}

") for item in content["content"]: if item["type"] in {"paragraph", "heading"}: @@ -189,12 +210,27 @@ class Chapter(BaseModel): new_content = new_content.replace(target_str, "") # type: ignore continue - img = epub.EpubImage( - file_name=strip_path, - content=resp, - ) - replaces.append(img) - newiimage = f'' + md5 = hashlib.md5(resp).digest() + final_path = None + if md5 in IMG_HASHES: + similar_path = IMG_HASHES[md5] + img_bytes = (cache / similar_path).read_bytes() + if img_bytes == resp: + print("Found identical image in cache") + final_path = similar_path + else: + IMG_HASHES[md5] = strip_path + + if final_path is None: + img = epub.EpubImage( + file_name=strip_path, + content=resp, + ) + replaces.append(img) + final_path = strip_path + + newiimage = f'' + print(target_str, newiimage) new_content = new_content.replace(target_str, newiimage) # type: ignore page.content = new_content @@ -298,7 +334,7 @@ def main(): book.set_title(info.rusName) book.add_metadata(namespace="DC", name="description", value=info.summary) - book.spine = [] + book.spine = ["translators"] if cover: book.spine.insert(0, "cover") book.set_cover(args.cover.name, cover) @@ -314,8 +350,25 @@ def main(): print("Using default client") client = requests.Session() + translators_page = epub.EpubHtml( + file_name="translators.xhtml", + title="Translators", + lang="ru", + ) + book.add_item(translators_page) + book.toc.append(translators_page) + book.spine.append(translators_page) # type: ignore + + translators = set() for chapter in chapters: chapter.add_to_book(book, args.input, args.fetch_images, client, args.cache) + translators.update(chapter.get_translators(args.input)) + + translators_content = '

Переводчики:

\n\n" + translators_page.content = translators_content book.add_item(epub.EpubNcx())