Added image fetching via Tor.

2025-12-22 16:22:06 +01:00
parent 6bed4c4938
commit 57dca56789
6 changed files with 535 additions and 19 deletions
--- a/main.py
+++ b/main.py
@ -1,10 +1,17 @@
 from typing import Any, Tuple
+from yarl import URL
+import copy
 import zipfile
 from ebooklib import epub
 from pathlib import Path
 from pydantic import BaseModel, TypeAdapter
 from argparse import ArgumentParser
 import json
+import re
+from requests_tor import RequestsTor
+import requests
+
+# Matches a self-closing `<img ... src="http(s)..." ... />` tag and captures
+# the remote URL in the named group "src".
+# NOTE(review): every `.*` here is greedy and `.` does not cross newlines, so
+# (a) `src` can run past the URL's closing quote when another `"` follows in
+# the same tag (e.g. an `alt="..."` attribute), and (b) several tags on one
+# line can be matched as a single span. Confirm against real chapter HTML.
+IMAGE_REMOVE_REGEX = re.compile(r"""<img\s+.*src\s*=\s*\"(?P<src>https?.*)\".*/>""")


 class Author(BaseModel):
@ -58,6 +65,12 @@ class Chapter(BaseModel):
            if branch_zip.exists():
                return branch_zip

+    def get_image(self, image_id: str, zip: zipfile.ZipFile) -> Tuple[str, bytes]:
+        """Find the archive member whose name begins with ``image_id`` and read it.
+
+        Args:
+            image_id: Prefix to look for among the archive's member names.
+            zip: Open archive to search. (The parameter name shadows the
+                ``zip`` builtin inside this method.)
+
+        Returns:
+            ``(member_name, member_bytes)`` for the first member, in
+            ``namelist()`` order, whose name starts with ``image_id``.
+
+        Raises:
+            FileNotFoundError: If no member name starts with ``image_id``.
+        """
+        for file in zip.namelist():
+            # NOTE(review): this is a prefix match, so an id such as "1" would
+            # also match a member named "10.png"; the first hit wins.
+            if file.startswith(image_id):
+                return (file, zip.read(file))
+        raise FileNotFoundError(f"Image {image_id} is not found in the archive")
+
    def load(self, base_dir: Path) -> Tuple[epub.EpubHtml, list[Any]]:
        title = f"Глава {self.number}"
        book_item = epub.EpubHtml(
@ -112,11 +125,11 @@ class Chapter(BaseModel):
            elif item["type"] == "image":
                for image in item["attrs"]["images"]:
                    image_name = image["image"]
-                    image_path = f"{image_name}.png"
+                    image_path, image_bytes = self.get_image(image_name, zip)
                    image_item = epub.EpubImage(
                        uid=image_name,
                        file_name=image_path,
-                        content=zip.read(f"{image_name}.png"),
+                        content=image_bytes,
                    )
                    extras.append(image_item)
                    output.append(f'<img src="../{image_path}" />')
@ -130,10 +143,81 @@ class Chapter(BaseModel):

        return (book_item, extras)

-    def add_to_book(self, book: epub.EpubBook, base_dir: Path):
+    def request_with_id_update(
+        self,
+        client: requests.Session | RequestsTor,
+        url: str,
+    ) -> bytes | None:
+        """GET ``url`` and return the response body, retrying once on HTTP 429.
+
+        The retry only happens when ``client`` is a ``RequestsTor`` instance:
+        ``client.new_id()`` is called first and the request is then repeated.
+        With any other client a 429 response is not retried.
+
+        Args:
+            client: HTTP client used to issue the request.
+            url: Address to fetch.
+
+        Returns:
+            The response body as bytes, or ``None`` when the final response
+            is not ``ok``.
+        """
+        print("Fetching", url)
+        # NOTE(review): no timeout is passed and nothing here catches errors
+        # raised by `client.get` — confirm that is intended.
+        resp = client.get(url)
+        if resp.status_code == 429:
+            print("Ratelimit hit.")
+            if isinstance(client, RequestsTor):
+                # Presumably switches to a new Tor identity/exit IP before the
+                # single retry — verify against the requests_tor documentation.
+                print("Updating client's IP")
+                client.new_id()
+                resp = client.get(url)
+
+        # Reached for the first response, or for the retried one if a retry ran.
+        if not resp.ok:
+            return None
+
+        return resp.content
+
+    def replace_images(
+        self,
+        page: epub.EpubHtml,
+        client: requests.Session | RequestsTor,
+        cache: Path,
+    ) -> list[epub.EpubImage]:
+        """Swap remote ``<img>`` tags in ``page`` for locally embedded images.
+
+        Every ``IMAGE_REMOVE_REGEX`` match in ``page.content`` is handled in
+        turn: the image bytes are read from ``cache`` when a file for the URL
+        path already exists there, otherwise they are fetched with
+        ``request_with_id_update`` and written to ``cache``. Tags whose image
+        could not be obtained are removed from the content; the others are
+        rewritten to ``<img src="../<url path>" />``.
+
+        Args:
+            page: Chapter page; its ``content`` attribute is overwritten with
+                the rewritten markup.
+            client: HTTP client passed through to ``request_with_id_update``.
+            cache: Directory under which downloaded images are stored.
+
+        Returns:
+            One ``EpubImage`` per successfully resolved match, in match order.
+        """
+        # Matches are searched in the untouched `page.content`; all rewriting
+        # is applied to this separate working value.
+        new_content = copy.copy(page.content)
+        replaces = []
+        for match in IMAGE_REMOVE_REGEX.finditer(page.content):  # type: ignore
+            # The exact text of the matched tag, used below as the replace key.
+            target_str: str = page.content[match.start() : match.end()]  # type: ignore
+            src: str = match.group("src")
+            url = URL(src)
+            # NOTE(review): only the URL path is used for the cache file and
+            # the in-book file name; host and query string are ignored, so
+            # different URLs that share a path map to the same entry.
+            strip_path = url.path.lstrip("/")
+            cached_path = cache / strip_path
+            if (cached_path).exists():
+                print(f"Using cached image from {cached_path}")
+                resp = cached_path.read_bytes()
+            else:
+                # Despite its name, `resp` holds the body bytes (or None).
+                resp = self.request_with_id_update(client, src)
+                # NOTE(review): an empty body is falsy, so it is not cached,
+                # yet it is not None either and is still embedded below.
+                if resp:
+                    cached_path.parent.mkdir(parents=True, exist_ok=True)
+                    cached_path.write_bytes(resp)
+
+            if resp is None:
+                # Fetch failed: drop the tag from the output.
+                new_content = new_content.replace(target_str, "")  # type: ignore
+                continue
+
+            # NOTE(review): an image that appears in several matches yields
+            # several EpubImage items with the same file_name.
+            img = epub.EpubImage(
+                file_name=strip_path,
+                content=resp,
+            )
+            replaces.append(img)
+            newiimage = f'<img src="../{strip_path}" />'
+            # `replace` rewrites every occurrence of this exact tag text, not
+            # only the one belonging to the current match.
+            new_content = new_content.replace(target_str, newiimage)  # type: ignore
+
+        page.content = new_content
+        return replaces
+
+    def add_to_book(
+        self,
+        book: epub.EpubBook,
+        base_dir: Path,
+        fetch_images: bool,
+        client: requests.Session | RequestsTor,
+        cache: Path,
+    ):
        (item, extras) = self.load(base_dir)
        for extra in extras:
            book.add_item(extra)
+
+        if not fetch_images:
+            item.content = IMAGE_REMOVE_REGEX.sub("", item.content)
+        else:
+            for image in self.replace_images(item, client, cache):  # type: ignore
+                book.add_item(image)
+
        book.add_item(item)
        book.spine.append(item)
        book.toc.append(item)
@ -160,11 +244,32 @@ def parse_args():
    parser.add_argument(
        "--volume",
        type=int,
-        required=True,
+        required=False,
    )
    parser.add_argument(
-        "--cover", "-c", type=Path, required=False, help="Path to cover image"
+        "--cover",
+        "-c",
+        type=Path,
+        required=False,
+        help="Path to cover image",
    )
+    parser.add_argument(
+        "--fetch-images",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--cache",
+        type=Path,
+        default="cache",
+        help="Images cache directory",
+    )
+    parser.add_argument(
+        "--tor-ports",
+        type=lambda x: tuple(int(p) for p in x.split(",") if p),
+        default="",
+    )
+    parser.add_argument("--tor-controller-port", type=int, default=9051)
+    parser.add_argument("--tor-password", type=str, default=None)
    return parser.parse_args()


@ -177,19 +282,15 @@ def main():
    info: BookMedia = BookMedia.model_validate(
        json.load((args.input / "info.json").open())["media"]
    )
-    chapters: list[Chapter] = list(
-        sorted(
-            filter(
-                # Filter volumes
-                lambda c: c.volume == args.volume,
-                TypeAdapter(list[Chapter]).validate_python(
-                    json.load((args.input / "chapters.json").open())
-                ),
-            ),
-            # Sort by chapter number
-            key=lambda c: float(c.number),
-        )
+    chapters: list[Chapter] = sorted(
+        TypeAdapter(list[Chapter]).validate_python(
+            json.load((args.input / "chapters.json").open())
+        ),
+        key=lambda c: float(c.number),
    )
+    if args.volume is not None:
+        chapters = [c for c in chapters if c.volume == args.volume]
+
    book = epub.EpubBook()
    for author in info.authors:
        book.add_author(author.name)
@ -202,12 +303,23 @@ def main():
        book.spine.insert(0, "cover")
        book.set_cover(args.cover.name, cover)

+    if args.tor_ports:
+        print(f"Using tor ports: {args.tor_ports}")
+        client = RequestsTor(
+            tor_ports=args.tor_ports,
+            tor_cport=args.tor_controller_port,
+            password=args.tor_password,
+        )
+    else:
+        print("Using default client")
+        client = requests.Session()
+
    for chapter in chapters:
-        chapter.add_to_book(book, args.input)
+        chapter.add_to_book(book, args.input, args.fetch_images, client, args.cache)

    book.add_item(epub.EpubNcx())

-    output_path: Path = args.output / f"{info.rusName}-{args.volume}.epub"
+    output_path: Path = args.output / f"{info.rusName}-{args.volume or 'full'}.epub"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    epub.write_epub(str(output_path), book)