Added Tor support for fetching images.
This commit is contained in:
150
main.py
150
main.py
@ -1,10 +1,17 @@
|
||||
from typing import Any, Tuple
|
||||
from yarl import URL
|
||||
import copy
|
||||
import zipfile
|
||||
from ebooklib import epub
|
||||
from pathlib import Path
|
||||
from pydantic import BaseModel, TypeAdapter
|
||||
from argparse import ArgumentParser
|
||||
import json
|
||||
import re
|
||||
from requests_tor import RequestsTor
|
||||
import requests
|
||||
|
||||
# Matches a single <img> tag whose src attribute is a remote http(s) URL and
# exposes that URL as the named group "src".
#
# Every wildcard is bounded: `[^>]` can never leave the current tag and
# `[^"]` can never leave the attribute value.  With unbounded greedy `.*`
# the "src" group would run up to the last quote on the line (swallowing
# trailing attributes), and one match could span several tags, including
# local `../` images that must be kept.  The closing slash is optional so
# that non-self-closed `<img ...>` tags are recognised as well.
IMAGE_REMOVE_REGEX = re.compile(
    r'<img\s+[^>]*?src\s*=\s*"(?P<src>https?[^"]*)"[^>]*/?>'
)
|
||||
|
||||
|
||||
class Author(BaseModel):
|
||||
@ -58,6 +65,12 @@ class Chapter(BaseModel):
|
||||
if branch_zip.exists():
|
||||
return branch_zip
|
||||
|
||||
def get_image(self, image_id: str, zip: zipfile.ZipFile) -> Tuple[str, bytes]:
    """Locate an image inside the chapter archive by its identifier.

    The file extension is not known up front, so archive members are
    searched by name.  A member named exactly ``image_id`` or
    ``image_id.<ext>`` is preferred; only when no such member exists does
    the lookup fall back to the first member whose name merely starts with
    ``image_id``.  Without that preference an id such as ``img1`` could
    resolve to ``img10.png`` depending on archive order.

    Args:
        image_id: Identifier of the image (archive member name without
            its extension).
        zip: Opened archive to search.

    Returns:
        A ``(member_name, content)`` tuple.

    Raises:
        FileNotFoundError: If no archive member matches ``image_id``.
    """
    names = zip.namelist()
    exact_prefix = f"{image_id}."
    member = next(
        (n for n in names if n == image_id or n.startswith(exact_prefix)),
        None,
    )
    if member is None:
        # Backward-compatible fallback: plain prefix match.
        member = next((n for n in names if n.startswith(image_id)), None)
    if member is None:
        raise FileNotFoundError(f"Image {image_id} is not found in the archive")
    return (member, zip.read(member))
|
||||
|
||||
def load(self, base_dir: Path) -> Tuple[epub.EpubHtml, list[Any]]:
|
||||
title = f"Глава {self.number}"
|
||||
book_item = epub.EpubHtml(
|
||||
@ -112,11 +125,11 @@ class Chapter(BaseModel):
|
||||
elif item["type"] == "image":
|
||||
for image in item["attrs"]["images"]:
|
||||
image_name = image["image"]
|
||||
image_path = f"{image_name}.png"
|
||||
image_path, image_bytes = self.get_image(image_name, zip)
|
||||
image_item = epub.EpubImage(
|
||||
uid=image_name,
|
||||
file_name=image_path,
|
||||
content=zip.read(f"{image_name}.png"),
|
||||
content=image_bytes,
|
||||
)
|
||||
extras.append(image_item)
|
||||
output.append(f'<img src="../{image_path}" />')
|
||||
@ -130,10 +143,81 @@ class Chapter(BaseModel):
|
||||
|
||||
return (book_item, extras)
|
||||
|
||||
def add_to_book(self, book: epub.EpubBook, base_dir: Path):
|
||||
def request_with_id_update(
    self,
    client: requests.Session | RequestsTor,
    url: str,
) -> bytes | None:
    """Download ``url`` and return the response body.

    When the server answers with HTTP 429 the request is retried once; if
    ``client`` is a ``RequestsTor`` instance its identity is renewed via
    ``new_id()`` before the retry.

    Args:
        client: HTTP client used to issue the GET requests.
        url: Address of the resource to download.

    Returns:
        The response body, or ``None`` when the request fails at the
        transport level or the final response is not successful.
    """
    print("Fetching", url)
    try:
        resp = client.get(url)
        if resp.status_code == 429:
            print("Ratelimit hit.")
            if isinstance(client, RequestsTor):
                print("Updating client's IP")
                client.new_id()
            # NOTE(review): the retry is issued for every client type; only
            # Tor clients rotate their identity first - confirm that plain
            # sessions are meant to retry immediately as well.
            resp = client.get(url)
    except requests.RequestException as err:
        # A transport failure means "image unavailable", which callers
        # already handle through the None return value; report it instead
        # of aborting the whole run.
        print(f"Failed to fetch {url}: {err}")
        return None

    if not resp.ok:
        return None

    return resp.content
|
||||
|
||||
def replace_images(
    self,
    page: epub.EpubHtml,
    client: requests.Session | RequestsTor,
    cache: Path,
) -> list[epub.EpubImage]:
    """Swap remote ``<img>`` tags in ``page`` for images bundled in the book.

    Every tag matched by ``IMAGE_REMOVE_REGEX`` is resolved to image bytes,
    taken from ``cache`` when a file for the URL path already exists and
    downloaded through ``request_with_id_update`` otherwise (successful
    downloads are written to the cache).  Tags whose image is available are
    rewritten to point at ``../<url path>``; tags whose image could not be
    obtained are removed.  ``page.content`` is updated in place.

    Args:
        page: Chapter page whose content is rewritten.
        client: HTTP client used for downloads.
        cache: Root directory of the on-disk image cache.

    Returns:
        The images that must be added to the book, one per distinct URL
        path, so that a picture used several times on the page is neither
        fetched nor registered more than once.
    """
    new_content = page.content
    images: list[epub.EpubImage] = []
    # URL path -> image bytes, or None when the image is unavailable.
    payloads: dict[str, bytes | None] = {}

    for match in IMAGE_REMOVE_REGEX.finditer(page.content):  # type: ignore
        tag: str = match.group(0)
        src: str = match.group("src")
        strip_path = URL(src).path.lstrip("/")

        if strip_path not in payloads:
            cached_path = cache / strip_path
            if cached_path.exists():
                print(f"Using cached image from {cached_path}")
                payload = cached_path.read_bytes()
            else:
                payload = self.request_with_id_update(client, src)
                if payload:
                    cached_path.parent.mkdir(parents=True, exist_ok=True)
                    cached_path.write_bytes(payload)
            payloads[strip_path] = payload
            if payload is not None:
                images.append(
                    epub.EpubImage(
                        file_name=strip_path,
                        content=payload,
                    )
                )

        if payloads[strip_path] is None:
            # Image unavailable: drop the tag rather than keep a dead link.
            new_content = new_content.replace(tag, "")  # type: ignore
            continue

        new_tag = f'<img src="../{strip_path}" />'
        new_content = new_content.replace(tag, new_tag)  # type: ignore

    page.content = new_content
    return images
|
||||
|
||||
def add_to_book(
|
||||
self,
|
||||
book: epub.EpubBook,
|
||||
base_dir: Path,
|
||||
fetch_images: bool,
|
||||
client: requests.Session | RequestsTor,
|
||||
cache: Path,
|
||||
):
|
||||
(item, extras) = self.load(base_dir)
|
||||
for extra in extras:
|
||||
book.add_item(extra)
|
||||
|
||||
if not fetch_images:
|
||||
item.content = IMAGE_REMOVE_REGEX.sub("", item.content)
|
||||
else:
|
||||
for image in self.replace_images(item, client, cache): # type: ignore
|
||||
book.add_item(image)
|
||||
|
||||
book.add_item(item)
|
||||
book.spine.append(item)
|
||||
book.toc.append(item)
|
||||
@ -160,11 +244,32 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--volume",
|
||||
type=int,
|
||||
required=True,
|
||||
required=False,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cover", "-c", type=Path, required=False, help="Path to cover image"
|
||||
"--cover",
|
||||
"-c",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Path to cover image",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fetch-images",
|
||||
action="store_true",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache",
|
||||
type=Path,
|
||||
default="cache",
|
||||
help="Images cache directory",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tor-ports",
|
||||
type=lambda x: tuple(int(p) for p in x.split(",") if p),
|
||||
default="",
|
||||
)
|
||||
parser.add_argument("--tor-controller-port", type=int, default=9051)
|
||||
parser.add_argument("--tor-password", type=str, default=None)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -177,19 +282,15 @@ def main():
|
||||
info: BookMedia = BookMedia.model_validate(
|
||||
json.load((args.input / "info.json").open())["media"]
|
||||
)
|
||||
chapters: list[Chapter] = list(
|
||||
sorted(
|
||||
filter(
|
||||
# Filter volumes
|
||||
lambda c: c.volume == args.volume,
|
||||
TypeAdapter(list[Chapter]).validate_python(
|
||||
json.load((args.input / "chapters.json").open())
|
||||
),
|
||||
),
|
||||
# Sort by chapter number
|
||||
key=lambda c: float(c.number),
|
||||
)
|
||||
chapters: list[Chapter] = sorted(
|
||||
TypeAdapter(list[Chapter]).validate_python(
|
||||
json.load((args.input / "chapters.json").open())
|
||||
),
|
||||
key=lambda c: float(c.number),
|
||||
)
|
||||
if args.volume is not None:
|
||||
chapters = [c for c in chapters if c.volume == args.volume]
|
||||
|
||||
book = epub.EpubBook()
|
||||
for author in info.authors:
|
||||
book.add_author(author.name)
|
||||
@ -202,12 +303,23 @@ def main():
|
||||
book.spine.insert(0, "cover")
|
||||
book.set_cover(args.cover.name, cover)
|
||||
|
||||
if args.tor_ports:
|
||||
print(f"Using tor ports: {args.tor_ports}")
|
||||
client = RequestsTor(
|
||||
tor_ports=args.tor_ports,
|
||||
tor_cport=args.tor_controller_port,
|
||||
password=args.tor_password,
|
||||
)
|
||||
else:
|
||||
print("Using default client")
|
||||
client = requests.Session()
|
||||
|
||||
for chapter in chapters:
|
||||
chapter.add_to_book(book, args.input)
|
||||
chapter.add_to_book(book, args.input, args.fetch_images, client, args.cache)
|
||||
|
||||
book.add_item(epub.EpubNcx())
|
||||
|
||||
output_path: Path = args.output / f"{info.rusName}-{args.volume}.epub"
|
||||
output_path: Path = args.output / f"{info.rusName}-{args.volume or 'full'}.epub"
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
epub.write_epub(str(output_path), book)
|
||||
|
||||
Reference in New Issue
Block a user