update metadata generation and add validation tests
All checks were successful
ci/woodpecker/push/tests Pipeline was successful

This commit is contained in:
Ivan Schaller 2023-02-15 13:50:59 +01:00
parent 3368b18677
commit d7c3d511fe
Signed by: olofvndrhr
GPG key ID: 2A6BE07D99C8C205
5 changed files with 142 additions and 71 deletions

View file

@ -4,6 +4,7 @@ loguru>=0.6.0
click>=8.1.3
click-option-group>=0.5.5
xmltodict>=0.13.0
xmlschema>=2.2.1
img2pdf>=0.4.4

View file

@ -1,63 +1,118 @@
from pathlib import Path
from typing import Any
import xmltodict
from loguru import logger as log
# relative path of the ComicInfo v2.0 xml file
METADATA_TEMPLATE = Path("mangadlp/metadata/ComicInfo_v2.0.xml")

# closed value sets used by the enumerated fields below
_YES_NO_UNKNOWN = ("Yes", "No", "Unknown")
_AGE_RATINGS = (
    "Unknown",
    "Adults Only 18+",
    "Early Childhood",
    "Everyone",
    "Everyone 10+",
    "G",
    "Kids to Adults",
    "M",
    "MA15+",
    "Mature 17+",
    "PG",
    "R18+",
    "Rating Pending",
    "Teen",
    "X18+",
)

# one row per supported field: (key, type, default value, allowed values).
# an empty "allowed values" entry means no value check.
# the row order is the order in which the keys end up in METADATA_TYPES
_METADATA_ROWS = (
    ("Title", str, None, ()),
    ("Series", str, None, ()),
    ("Number", str, None, ()),
    ("Count", int, None, ()),
    ("Volume", int, None, ()),
    ("AlternateSeries", str, None, ()),
    ("AlternateNumber", str, None, ()),
    ("AlternateCount", int, None, ()),
    ("Summary", str, None, ()),
    ("Notes", str, "Downloaded with https://github.com/olofvndrhr/manga-dlp", ()),
    ("Year", int, None, ()),
    ("Month", int, None, ()),
    ("Day", int, None, ()),
    ("Writer", str, None, ()),
    ("Colorist", str, None, ()),
    ("Publisher", str, None, ()),
    ("Genre", str, None, ()),
    ("Web", str, None, ()),
    ("PageCount", int, None, ()),
    ("LanguageISO", str, None, ()),
    ("Format", str, None, ()),
    ("BlackAndWhite", str, None, _YES_NO_UNKNOWN),
    ("Manga", str, "Yes", _YES_NO_UNKNOWN + ("YesAndRightToLeft",)),
    ("ScanInformation", str, None, ()),
    ("SeriesGroup", str, None, ()),
    ("AgeRating", str, None, _AGE_RATINGS),
    ("CommunityRating", int, None, (1, 2, 3, 4, 5)),
)

# define metadata types and valid values. an empty list means no value check.
# every entry gets its own list object, so no two keys share a validation list
METADATA_TYPES: dict[str, tuple[type, Any, list]] = {
    field: (field_type, default, list(allowed))
    for field, field_type, default, allowed in _METADATA_ROWS
}
def validate_metadata(metadata_in: dict) -> dict:
    """Reduce raw metadata to the entries accepted by ``METADATA_TYPES``.

    ``METADATA_TYPES`` is walked in its declared order. For every key the
    declared default (if truthy) is stored first; it is then replaced by the
    value from ``metadata_in`` when that value is present, non-empty, of the
    declared type and, if a list of allowed values is declared, one of them.
    Values that fail a check are dropped with a warning. Keys of
    ``metadata_in`` that are not listed in ``METADATA_TYPES`` are ignored.

    Args:
        metadata_in: Raw metadata, keyed by the names used in
            ``METADATA_TYPES``.

    Returns:
        ``{"ComicInfo": {...}}`` containing only defaults and accepted
        values, with keys in ``METADATA_TYPES`` order.
    """
    log.info("Validating metadata")

    metadata_valid: dict[str, dict] = {"ComicInfo": {}}
    for key, value in METADATA_TYPES.items():
        metadata_type, metadata_default, metadata_validation = value

        # add default value if present
        if metadata_default:
            log.info(f"Setting default value for Key:{key} -> value={metadata_default}")
            metadata_valid["ComicInfo"][key] = metadata_default

        # check if metadata key is available
        try:
            md_to_check = metadata_in[key]
        except KeyError:
            continue

        # check if provided metadata item is empty
        if not md_to_check:
            continue

        # check if metadata type is correct.
        # bool is a subclass of int, so a plain isinstance() check would let
        # True through for int fields (and True == 1 would even pass a
        # numeric whitelist); reject bools unless the field is declared bool
        log.debug(f"Key:{key} -> value={type(md_to_check)} -> check={metadata_type}")
        is_stray_bool = isinstance(md_to_check, bool) and metadata_type is not bool
        if is_stray_bool or not isinstance(md_to_check, metadata_type):  # noqa
            log.warning(
                f"Metadata has wrong type: {key}:{metadata_type} -> {md_to_check}"
            )
            continue

        # check if metadata is valid
        log.debug(f"Key:{key} -> value={md_to_check} -> valid={metadata_validation}")
        if metadata_validation and md_to_check not in metadata_validation:
            log.warning(
                f"Metadata is invalid: {key}:{metadata_validation} -> {md_to_check}"
            )
            continue

        log.debug(f"Updating metadata: '{key}' = '{md_to_check}'")
        metadata_valid["ComicInfo"][key] = md_to_check

    return metadata_valid
def write_metadata(chapter_path: Path, metadata: dict) -> None:
    """Write the validated metadata of a chapter to ``ComicInfo.xml``.

    The raw metadata is filtered through ``validate_metadata`` and the result
    is serialised with ``xmltodict.unparse``. Nothing is written when the
    format is ``"pdf"``.

    Args:
        chapter_path: Directory in which ``ComicInfo.xml`` is created.
        metadata: Raw metadata of the chapter; must contain the key
            ``"Format"``.

    Raises:
        KeyError: If ``metadata`` has no ``"Format"`` key.
    """
    if metadata["Format"] == "pdf":
        log.warning("Can't add metadata for pdf format. Skipping")
        return

    metadata_file = chapter_path / "ComicInfo.xml"

    # validate_metadata is the only source of the exported data; no xml
    # template is read or merged here
    log.debug(f"Metadata items: {metadata}")
    metadata_valid = validate_metadata(metadata)

    log.info(f"Writing metadata to: '{metadata_file}'")
    metadata_export = xmltodict.unparse(
        metadata_valid, pretty=True, indent=" " * 4, short_empty_elements=True
    )
    metadata_file.touch(exist_ok=True)
    metadata_file.write_text(metadata_export, encoding="utf8")

View file

@ -1,20 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<ComicInfo>
<Title></Title>
<Series></Series>
<Number></Number>
<Count></Count>
<Volume></Volume>
<Summary></Summary>
<Genre></Genre>
<Web></Web>
<PageCount></PageCount>
<LanguageISO></LanguageISO>
<Format></Format>
<ScanInformation></ScanInformation>
<SeriesGroup></SeriesGroup>
<BlackAndWhite>Unknown</BlackAndWhite>
<AgeRating>Unknown</AgeRating>
<Manga>Yes</Manga>
<Notes>Downloaded with https://github.com/olofvndrhr/manga-dlp</Notes>
</ComicInfo>

View file

@ -6,15 +6,11 @@
<Count>10</Count>
<Volume>1</Volume>
<Summary>summary1</Summary>
<Notes>Downloaded with https://github.com/olofvndrhr/manga-dlp</Notes>
<Genre>genre1</Genre>
<Web>https://mangadex.org</Web>
<PageCount>99</PageCount>
<LanguageISO>en</LanguageISO>
<Format>cbz</Format>
<ScanInformation></ScanInformation>
<SeriesGroup></SeriesGroup>
<BlackAndWhite>Unknown</BlackAndWhite>
<AgeRating>Unknown</AgeRating>
<Manga>Yes</Manga>
<Notes>Downloaded with https://github.com/olofvndrhr/manga-dlp</Notes>
</ComicInfo>

View file

@ -1,5 +1,9 @@
import shutil
import subprocess
from pathlib import Path
import xmlschema
from mangadlp.metadata import write_metadata
@ -30,3 +34,38 @@ def test_metadata_creation():
# cleanup
metadata_file.unlink()
def test_metadata_chapter_validity():
    """Run manga-dlp.py for one chapter and validate its ComicInfo.xml.

    The script is started as a subprocess with the arguments below; the test
    passes when it exits with 0, the expected ``ComicInfo.xml`` exists and the
    file is valid against ``mangadlp/metadata/ComicInfo_v2.0.xsd``.
    """
    url_uuid = "https://mangadex.org/title/76ee7069-23b4-493c-bc44-34ccbf3051a8/tomo-chan-wa-onna-no-ko"
    manga_path = Path("tests/Tomo-chan wa Onna no ko")
    metadata_path = Path(
        "tests/Tomo-chan wa Onna no ko/Ch. 1 - Once In A Life Time Misfire/ComicInfo.xml"
    )
    language = "en"
    chapters = "1"
    download_path = "tests"
    command_args = [
        "-u",
        url_uuid,
        "-l",
        language,
        "-c",
        chapters,
        "--path",
        download_path,
        "--format",
        "",
        "--debug",
    ]
    schema = xmlschema.XMLSchema("mangadlp/metadata/ComicInfo_v2.0.xsd")

    script_path = "manga-dlp.py"
    command = ["python3", script_path] + command_args

    try:
        assert subprocess.call(command) == 0
        assert metadata_path.is_file()
        assert schema.is_valid(metadata_path)
    finally:
        # cleanup, also when one of the assertions above fails, so a failed
        # run does not leave its output behind in the tests directory
        shutil.rmtree(manga_path, ignore_errors=True)