Added image fetching via Tor.

This commit is contained in:
2025-12-22 16:22:06 +01:00
parent 6bed4c4938
commit 57dca56789
6 changed files with 535 additions and 19 deletions

150
main.py
View File

@ -1,10 +1,17 @@
from typing import Any, Tuple
from yarl import URL
import copy
import zipfile
from ebooklib import epub
from pathlib import Path
from pydantic import BaseModel, TypeAdapter
from argparse import ArgumentParser
import json
import re
from requests_tor import RequestsTor
import requests
# Matches one self-closing <img ... /> tag whose src is an absolute http(s)
# URL and exposes that URL as the named group "src".
#
# Every wildcard is bounded to a single tag ([^>]) or a single attribute
# value ([^"]). An unbounded greedy ".*" would let one match start at an
# earlier <img> on the same line (including locally embedded images with a
# relative src) and run to the last "/>", capturing the wrong URL and
# swallowing unrelated tags.
# NOTE: an attribute value containing a literal ">" is not supported.
IMAGE_REMOVE_REGEX = re.compile(
    r"""<img\s+[^>]*?src\s*=\s*"(?P<src>https?[^"]*)"[^>]*/>"""
)
class Author(BaseModel):
@ -58,6 +65,12 @@ class Chapter(BaseModel):
if branch_zip.exists():
return branch_zip
def get_image(self, image_id: str, zip: zipfile.ZipFile) -> Tuple[str, bytes]:
    """Find an image in the archive by its id and return its name and bytes.

    The archive member's extension is not known in advance, so the lookup
    goes by name. A member whose full name or name-without-extension equals
    ``image_id`` wins; only if there is none does the lookup fall back to
    the first member whose name merely starts with ``image_id``. Without the
    exact pass, id ``img1`` could resolve to ``img10.png``.

    Args:
        image_id: Image identifier (archive member name without extension).
        zip: Opened archive to search.

    Returns:
        A ``(member_name, content)`` tuple.

    Raises:
        FileNotFoundError: No archive member matches ``image_id``.
    """
    names = zip.namelist()

    # First pass: exact match on the full name or on the stem.
    for file in names:
        stem, dot, _ext = file.rpartition(".")
        if file == image_id or (dot and stem == image_id):
            return (file, zip.read(file))

    # Second pass: plain prefix match, kept for backward compatibility.
    for file in names:
        if file.startswith(image_id):
            return (file, zip.read(file))

    raise FileNotFoundError(f"Image {image_id} is not found in the archive")
def load(self, base_dir: Path) -> Tuple[epub.EpubHtml, list[Any]]:
title = f"Глава {self.number}"
book_item = epub.EpubHtml(
@ -112,11 +125,11 @@ class Chapter(BaseModel):
elif item["type"] == "image":
for image in item["attrs"]["images"]:
image_name = image["image"]
image_path = f"{image_name}.png"
image_path, image_bytes = self.get_image(image_name, zip)
image_item = epub.EpubImage(
uid=image_name,
file_name=image_path,
content=zip.read(f"{image_name}.png"),
content=image_bytes,
)
extras.append(image_item)
output.append(f'<img src="../{image_path}" />')
@ -130,10 +143,81 @@ class Chapter(BaseModel):
return (book_item, extras)
def add_to_book(self, book: epub.EpubBook, base_dir: Path):
def request_with_id_update(
    self,
    client: requests.Session | RequestsTor,
    url: str,
) -> bytes | None:
    """Download ``url`` and return the response body.

    On HTTP 429 with a ``RequestsTor`` client, a new Tor identity is
    requested and the download is retried once. A plain session is not
    retried.

    Args:
        client: HTTP client used for the request.
        url: Address to download.

    Returns:
        The response body, or ``None`` when the final response is not OK.
    """
    print("Fetching", url)
    response = client.get(url)

    rate_limited = response.status_code == 429
    if rate_limited:
        print("Ratelimit hit.")
    if rate_limited and isinstance(client, RequestsTor):
        # Only a Tor client can change its exit IP before retrying.
        print("Updating client's IP")
        client.new_id()
        response = client.get(url)

    return response.content if response.ok else None
def replace_images(
    self,
    page: epub.EpubHtml,
    client: requests.Session | RequestsTor,
    cache: Path,
) -> list[epub.EpubImage]:
    """Replace remote images in ``page`` with images embedded in the book.

    Every tag matched by ``IMAGE_REMOVE_REGEX`` is handled as follows:

    * the image is read from ``cache`` if present, otherwise downloaded
      with ``request_with_id_update`` and stored in ``cache``;
    * on success the tag is rewritten to point at the embedded copy;
    * on failure, or when the URL has no path to derive a file name from,
      the tag is removed from the page.

    ``page.content`` is updated in place.

    Args:
        page: Chapter page whose content is rewritten.
        client: HTTP client used for downloads.
        cache: Directory holding downloaded images, laid out by URL path.

    Returns:
        The images to add to the book, one per distinct URL path.
    """
    original: str = page.content  # type: ignore
    new_content = original
    replaces: list[epub.EpubImage] = []
    # Paths already embedded from this page; a repeated image must not be
    # registered twice under the same file name.
    embedded: set[str] = set()

    for match in IMAGE_REMOVE_REGEX.finditer(original):
        target_str: str = match.group(0)
        src: str = match.group("src")
        strip_path = URL(src).path.lstrip("/")

        if not strip_path:
            # No path means no file name: ``cache / ""`` is the cache
            # directory itself, which can be neither read nor written as
            # an image.
            new_content = new_content.replace(target_str, "")
            continue

        local_tag = f'<img src="../{strip_path}" />'
        if strip_path in embedded:
            new_content = new_content.replace(target_str, local_tag)
            continue

        cached_path = cache / strip_path
        if cached_path.exists():
            print(f"Using cached image from {cached_path}")
            resp = cached_path.read_bytes()
        else:
            resp = self.request_with_id_update(client, src)
            if resp:
                cached_path.parent.mkdir(parents=True, exist_ok=True)
                cached_path.write_bytes(resp)

        if resp is None:
            # Download failed: drop the tag rather than keep a remote link.
            new_content = new_content.replace(target_str, "")
            continue

        replaces.append(epub.EpubImage(file_name=strip_path, content=resp))
        embedded.add(strip_path)
        new_content = new_content.replace(target_str, local_tag)

    page.content = new_content
    return replaces
def add_to_book(
    self,
    book: epub.EpubBook,
    base_dir: Path,
    fetch_images: bool,
    client: requests.Session | RequestsTor,
    cache: Path,
):
    """Load this chapter and register it, with its resources, in ``book``.

    Args:
        book: Book that receives the chapter page and its extra items.
        base_dir: Directory passed to ``load`` to read the chapter from.
        fetch_images: If true, remote images are embedded through
            ``replace_images``; otherwise the tags matched by
            ``IMAGE_REMOVE_REGEX`` are removed from the page.
        client: HTTP client passed to ``replace_images``.
        cache: Image cache directory passed to ``replace_images``.
    """
    item, extras = self.load(base_dir)

    # Resources produced by load() are registered before any remote fetch.
    for extra in extras:
        book.add_item(extra)

    if fetch_images:
        for image in self.replace_images(item, client, cache):  # type: ignore
            book.add_item(image)
    else:
        item.content = IMAGE_REMOVE_REGEX.sub("", item.content)

    book.add_item(item)
    book.spine.append(item)
    book.toc.append(item)
@ -160,11 +244,32 @@ def parse_args():
parser.add_argument(
"--volume",
type=int,
required=True,
required=False,
)
parser.add_argument(
"--cover", "-c", type=Path, required=False, help="Path to cover image"
"--cover",
"-c",
type=Path,
required=False,
help="Path to cover image",
)
parser.add_argument(
"--fetch-images",
action="store_true",
)
parser.add_argument(
"--cache",
type=Path,
default="cache",
help="Images cache directory",
)
parser.add_argument(
"--tor-ports",
type=lambda x: tuple(int(p) for p in x.split(",") if p),
default="",
)
parser.add_argument("--tor-controller-port", type=int, default=9051)
parser.add_argument("--tor-password", type=str, default=None)
return parser.parse_args()
@ -177,19 +282,15 @@ def main():
info: BookMedia = BookMedia.model_validate(
json.load((args.input / "info.json").open())["media"]
)
chapters: list[Chapter] = list(
sorted(
filter(
# Filter volumes
lambda c: c.volume == args.volume,
TypeAdapter(list[Chapter]).validate_python(
json.load((args.input / "chapters.json").open())
),
),
# Sort by chapter number
key=lambda c: float(c.number),
)
chapters: list[Chapter] = sorted(
TypeAdapter(list[Chapter]).validate_python(
json.load((args.input / "chapters.json").open())
),
key=lambda c: float(c.number),
)
if args.volume is not None:
chapters = [c for c in chapters if c.volume == args.volume]
book = epub.EpubBook()
for author in info.authors:
book.add_author(author.name)
@ -202,12 +303,23 @@ def main():
book.spine.insert(0, "cover")
book.set_cover(args.cover.name, cover)
if args.tor_ports:
print(f"Using tor ports: {args.tor_ports}")
client = RequestsTor(
tor_ports=args.tor_ports,
tor_cport=args.tor_controller_port,
password=args.tor_password,
)
else:
print("Using default client")
client = requests.Session()
for chapter in chapters:
chapter.add_to_book(book, args.input)
chapter.add_to_book(book, args.input, args.fetch_images, client, args.cache)
book.add_item(epub.EpubNcx())
output_path: Path = args.output / f"{info.rusName}-{args.volume}.epub"
output_path: Path = args.output / f"{info.rusName}-{args.volume or 'full'}.epub"
output_path.parent.mkdir(parents=True, exist_ok=True)
epub.write_epub(str(output_path), book)