Files
mangalib-epub-dumper/main.py
2025-12-21 23:48:01 +01:00

218 lines
6.4 KiB
Python

import html
import json
import zipfile
from argparse import ArgumentParser
from pathlib import Path
from typing import Any, Tuple

from ebooklib import epub
from pydantic import BaseModel, TypeAdapter
class Author(BaseModel):
    """A credited author entry nested inside the book metadata."""

    # Numeric identifier carried by the source data; not read by this script.
    id: int
    # Display name; this is the value written into the EPUB author metadata.
    name: str
class BookMedia(BaseModel):
    """Book-level metadata validated from the ``media`` object of ``info.json``."""

    # Title variants supplied by the dump.
    name: str
    # Russian title: used as the EPUB title and in the output file name.
    rusName: str
    engName: str
    otherNames: list[str]

    # Every listed author is added to the EPUB metadata.
    authors: list[Author]

    # Stored as the EPUB's DC ``description`` field.
    summary: str
class Branch(BaseModel):
    """One branch record attached to a chapter."""

    # Identifier of this record; not read by this script.
    id: int
    # Value that appears as the ``-b<branchId>`` suffix of a chapter archive name.
    branchId: int
def process_text(text_obj: dict) -> str:
    """Render one rich-text ``text`` node as an XHTML fragment.

    The node's ``text`` value is escaped and then wrapped, innermost first,
    in the tag for every entry of its optional ``marks`` list
    (``italic`` -> ``<i>``, ``bold`` -> ``<b>``).

    Args:
        text_obj: Mapping with a ``text`` key and an optional ``marks`` list
            whose entries each carry a ``type`` key.

    Returns:
        The escaped text wrapped in the matching inline tags.

    Raises:
        ValueError: If a mark has a type other than ``italic`` or ``bold``.
    """
    mark_tags = {"italic": "i", "bold": "b"}

    # The value ends up inside XHTML, so ``<``, ``>`` and ``&`` must not be
    # emitted raw. Assumes the node holds plain text rather than ready-made
    # markup -- TODO confirm against the dump format.
    content = html.escape(text_obj["text"], quote=False)

    # ``or []`` also covers an explicit ``"marks": null``.
    for mark in text_obj.get("marks") or []:
        tag = mark_tags.get(mark["type"])
        if tag is None:
            raise ValueError(f"unknown mark for text {text_obj}")
        content = f"<{tag}>{content}</{tag}>"
    return content
class Chapter(BaseModel):
    """A chapter record plus the logic that turns its archive into EPUB items.

    Each chapter's content lives in a zip archive inside the dump directory.
    The archive holds ``data.txt`` (either a JSON document tree or raw markup)
    and the ``.png`` images that the document references.
    """

    id: int
    volume: int
    # Kept as a string: it is interpolated verbatim into archive names,
    # titles and the XHTML file name.
    number: str
    name: str
    branches: list[Branch]
    withBranches: bool

    def get_zip_path(self, base_dir: Path) -> Path | None:
        """Locate this chapter's archive inside ``base_dir``.

        The un-branched name ``v<volume>-n<number>-<id>.zip`` is tried first,
        then one ``-b<branchId>`` variant per entry of ``branches``, in order.

        Args:
            base_dir: Directory that contains the chapter archives.

        Returns:
            The first candidate path that exists, or ``None`` if none does.
        """
        stem = f"v{self.volume}-n{self.number}-{self.id}"
        candidates = [base_dir / f"{stem}.zip"]
        candidates.extend(
            base_dir / f"{stem}-b{branch.branchId}.zip" for branch in self.branches
        )
        return next((path for path in candidates if path.exists()), None)

    def _render_inline(self, item: dict) -> str:
        """Render the inline children of a paragraph or heading node."""
        parts = []
        for sub_item in item.get("content") or []:
            if sub_item["type"] == "text":
                parts.append(process_text(sub_item))
            elif sub_item["type"] == "hardBreak":
                parts.append("<br />")
            else:
                raise ValueError(f"{self} - Unknown sub-item")
        # Joined without a separator: a newline between inline runs would be
        # rendered as a space, e.g. before punctuation that follows italics.
        return "".join(parts)

    def load(self, base_dir: Path) -> Tuple[epub.EpubHtml, list[Any]]:
        """Build the XHTML item, and any image items, for this chapter.

        Args:
            base_dir: Directory that contains the chapter archives.

        Returns:
            A ``(chapter_item, extras)`` tuple where ``extras`` holds one
            ``EpubImage`` per image referenced by the chapter.

        Raises:
            FileNotFoundError: If no archive exists for this chapter.
            ValueError: If the document tree has an unexpected root, block
                type or inline type.
        """
        title = f"Глава {self.number}"
        book_item = epub.EpubHtml(
            title=title,
            file_name=f"Text/ch{self.number}.xhtml",
            lang="ru",
        )

        zip_path = self.get_zip_path(base_dir)
        if zip_path is None:
            raise FileNotFoundError(f"Chapter for `{self}` not found")

        heading = f"<h2>{title}</h2>"

        # The context manager closes the archive on every exit path,
        # including the early return and the validation errors below.
        with zipfile.ZipFile(zip_path) as archive:
            data = archive.read("data.txt")
            try:
                # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
                content = json.loads(data)
            except ValueError:
                # Not JSON: treat the payload as ready-made markup.
                raw_markup = data.decode("utf-8")
                book_item.content = f"{heading}\n{raw_markup}"
                return (book_item, [])

            if not isinstance(content, dict) or content.get("type") != "doc":
                raise ValueError(f"{self} contains unknown document format")

            print(f"parsing {archive.filename}")
            output = [heading]
            extras = []

            for item in content.get("content") or []:
                kind = item["type"]
                if kind == "paragraph":
                    output.append(f"<p>{self._render_inline(item)}</p>")
                elif kind == "heading":
                    # ``or`` fallbacks also cover explicit nulls, which would
                    # otherwise be rendered as the literal text "None".
                    attrs = item.get("attrs") or {}
                    level = attrs.get("level") or 3
                    align = attrs.get("textAlign") or "center"
                    inner_content = self._render_inline(item)
                    output.append(
                        f'<h{level} align="{align}">{inner_content}</h{level}>'
                    )
                elif kind == "image":
                    for image in item["attrs"]["images"]:
                        image_name = image["image"]
                        image_path = f"{image_name}.png"
                        extras.append(
                            epub.EpubImage(
                                uid=image_name,
                                file_name=image_path,
                                content=archive.read(image_path),
                            )
                        )
                        # Chapter files live under ``Text/``, so step up one
                        # level to reach the image.
                        output.append(f'<img src="../{image_path}" />')
                elif kind == "horizontalRule":
                    output.append("<hr/>")
                else:
                    raise ValueError(f"{self} - unknown content type")

        book_item.content = "\n".join(output)
        return (book_item, extras)

    def add_to_book(self, book: epub.EpubBook, base_dir: Path):
        """Load this chapter and register it, with its images, in ``book``.

        The chapter item is appended to both the spine and the table of
        contents; image items are only added to the book.

        Args:
            book: Book being assembled.
            base_dir: Directory that contains the chapter archives.
        """
        (item, extras) = self.load(base_dir)
        for extra in extras:
            book.add_item(extra)
        book.add_item(item)
        book.spine.append(item)
        book.toc.append(item)
def parse_args():
parser = ArgumentParser()
parser.add_argument(
"--input-dir",
"-i",
dest="input",
type=Path,
help="Dumped book directory",
required=True,
)
parser.add_argument(
"--output-dir",
"-o",
dest="output",
type=Path,
default="output",
help="Where to put output EPUB files",
)
parser.add_argument(
"--volume",
type=int,
required=True,
)
parser.add_argument(
"--cover", "-c", type=Path, required=False, help="Path to cover image"
)
return parser.parse_args()
def main():
    """Build one EPUB for the requested volume of a dumped book.

    Reads ``info.json`` and ``chapters.json`` from the input directory, keeps
    the chapters of the requested volume ordered by chapter number, and writes
    ``<rusName>-<volume>.epub`` into the output directory.

    Raises:
        SystemExit: If no chapter belongs to the requested volume.
    """
    args = parse_args()

    cover: bytes | None = None
    if args.cover is not None:
        cover = args.cover.read_bytes()

    # Reading bytes lets ``json.loads`` detect the Unicode encoding itself,
    # instead of depending on the locale's default text encoding, and leaves
    # no file handle open.
    info_raw = json.loads((args.input / "info.json").read_bytes())
    info: BookMedia = BookMedia.model_validate(info_raw["media"])

    chapters_raw = json.loads((args.input / "chapters.json").read_bytes())
    all_chapters = TypeAdapter(list[Chapter]).validate_python(chapters_raw)
    chapters: list[Chapter] = sorted(
        (chapter for chapter in all_chapters if chapter.volume == args.volume),
        # Chapter numbers are strings such as "10" or "10.5"; order them
        # numerically rather than lexically.
        key=lambda c: float(c.number),
    )
    if not chapters:
        # An EPUB without chapters is never what the user asked for.
        raise SystemExit(f"No chapters found for volume {args.volume}")

    book = epub.EpubBook()
    for author in info.authors:
        book.add_author(author.name)
    book.set_language("ru")
    book.set_title(info.rusName)
    book.add_metadata(namespace="DC", name="description", value=info.summary)

    book.spine = []
    if cover:
        book.spine.insert(0, "cover")
        book.set_cover(args.cover.name, cover)

    for chapter in chapters:
        chapter.add_to_book(book, args.input)

    # NOTE(review): only the NCX navigation is added; no EpubNav item is
    # registered -- confirm that is intentional for the target readers.
    book.add_item(epub.EpubNcx())

    output_path: Path = args.output / f"{info.rusName}-{args.volume}.epub"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    epub.write_epub(str(output_path), book)
# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()