Source code for metabook.metabook

#!/usr/bin/env python3
# Core Library modules
import os
import re
import sys
from pathlib import Path
from typing import Optional

# Third party modules
import pdfplumber
import requests
from pdfrw import PdfDict, PdfReader, PdfWriter
from PyPDF2 import PdfReader as Reader
from PyPDF2.errors import PdfReadError
from requests import RequestException

# Local modules
from .cli import _parse_args
from .config import config
from .publishers import publisher_mapping, publishers

# URL templates for the supported book-metadata APIs, keyed by API name.
# fetch_book_metadata selects an entry with config.API and fills in the
# ``{isbn}`` placeholder via str.format.
book_apis = {
    "google": "https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}",
}



[docs]
def find_books(directory: Path) -> list[Path]:
    """Collects the PDF files located in a directory.

    Args:
        directory (Path): The directory to search.

    Returns:
        List[Path]: Paths of the files matching ``*.pdf``. Sub-directories are
        searched as well when ``config.RECURSE`` is truthy; otherwise only the
        top level of ``directory`` is searched.
    """
    root = Path(directory)
    # rglob descends into sub-directories, glob stays at the top level.
    search = root.rglob if config.RECURSE else root.glob
    return list(search("*.pdf"))




[docs]
def update_filename(book: Path, new_name: str) -> None:
    """Renames a PDF file in place, refusing to overwrite a different file.

    Args:
        book (Path): The current path to the PDF file.
        new_name (str): The new name for the PDF file (without extension).

    Notes:
        '.pdf' is appended to ``new_name`` and the file is renamed inside its
        current directory. If a different file already occupies the target
        path, nothing is renamed and a message naming the conflict is printed;
        the conflict is reported, not raised.
    """
    new_name = "".join([new_name, ".pdf"])
    new_path = book.with_name(new_name)
    # On POSIX, Path.rename() silently replaces an existing target, so the
    # conflict must be detected explicitly instead of relying on
    # FileExistsError alone. samefile() lets a rename onto the file itself
    # (same name, or a case-only change on a case-insensitive filesystem)
    # go through.
    if new_path.exists() and not new_path.samefile(book):
        print(f"Cannot rename file. File: {new_name} already exists")
        return
    try:
        book.rename(new_path)
    except FileExistsError:
        # Still possible on Windows, or if the target appears after the check.
        print(f"Cannot rename file. File: {new_name} already exists")




[docs]
def write_metadata(book: Path, new_name: str) -> None:
    """Overwrites the document-information fields of a PDF file.

    Args:
        book (Path): The path to the PDF file; it is read and then rewritten
            at the same path.
        new_name (str): The value stored as the PDF's Title.

    Notes:
        Title is set to ``new_name``; Subject, Author, Keywords, Creator and
        Producer are set to empty strings. ValueError, AttributeError and
        PermissionError raised while reading, updating or writing are caught
        and reported with a printed message rather than propagated.
    """
    info_fields = PdfDict(
        Title=new_name,
        Subject="",
        Author="",
        Keywords="",
        Creator="",
        Producer="",
    )
    try:
        document = PdfReader(book)
        document.Info.update(info_fields)
        writer = PdfWriter()
        writer.write(book, document)
    except (ValueError, AttributeError, PermissionError):
        print("An error occurred writing metadata")




[docs]
def render_template(meta: dict[str, str]) -> str:
    """Renders the new file name from book metadata.

    Args:
        meta (Dict[str, str]): Metadata for the book; the 'TITLE' and
            'SUBTITLE' entries decide which template is used.

    Returns:
        str: The rendered template.

    Notes:
        'config.TEMPLATE1' is rendered when 'SUBTITLE' is not the string
        'None' and the length of 'TITLE' plus 'SUBTITLE' plus 3 extra
        characters does not exceed 'config.TITLE_LEN_MAX'. In every other
        case 'config.TEMPLATE2' is rendered.
    """
    subtitle = meta["SUBTITLE"]
    subtitle_fits = (
        subtitle != "None"
        and len(meta["TITLE"]) + len(subtitle) + 3 <= config.TITLE_LEN_MAX
    )
    template = config.TEMPLATE1 if subtitle_fits else config.TEMPLATE2
    return template.render(meta)




[docs]
def sanitize_isbn(isbn_list: list[str]) -> list[str]:
    """Reduces candidate ISBN strings to their digits and keeps the 13-digit ones.

    Args:
        isbn_list (List[str]): Candidate ISBN strings, possibly containing
        separators or other non-digit characters.

    Returns:
        List[str]: The candidates with every non-digit character removed, in
                   their original order. Candidates that do not consist of
                   exactly 13 digits after cleaning are dropped.
    """
    digits_only = (re.sub(r"\D", "", candidate) for candidate in isbn_list)
    return [digits for digits in digits_only if len(digits) == 13]




[docs]
def normalize_filename(name: str) -> str:
    """Turns a rendered title into a string usable as a file name.

    Args:
        name (str): The input filename to be normalized.

    Returns:
        str: The normalized filename.

    Notes:
        Backslash, slash, asterisk, question mark, double quote, angle
        brackets and the pipe character are removed, and every colon is
        replaced by a hyphen.

    Configuration Options:
        - ALLOW_SPACE (bool): If False, spaces are replaced with underscores.
        - LOWERCASE_ONLY (bool): If True, the filename is converted to lowercase.
    """
    # Single pass: map ':' to '-' and delete the characters \ / * ? " < > |
    cleanup_table = str.maketrans(":", "-", r'\/*?"<>|')
    name = name.translate(cleanup_table)
    if config.ALLOW_SPACE is False:
        name = name.replace(" ", "_")
    if config.LOWERCASE_ONLY is True:
        name = name.lower()
    return name




[docs]
def hardcopy(book: str, isbn_list: list, new_name: str) -> None:
    """Appends a record for one processed book to 'hardcopy.txt'.

    Args:
        book (str): The original name of the book.
        isbn_list (list): The ISBNs associated with the book.
        new_name (str): The new name for the book.

    Returns:
        None

    Notes:
        The file 'hardcopy.txt' is opened relative to the current working
        directory in append mode with UTF-8 encoding. Each record is four
        lines: a rule of 90 asterisks, the ISBN list, the old name and the
        new name.
    """
    with open("hardcopy.txt", mode="a", encoding="utf-8") as log_file:
        record = ("*" * 90, f"{isbn_list}", f"Old: {book}", f"New: {new_name}")
        log_file.writelines(f"{field}\n" for field in record)




[docs]
def output(
    old_name: Optional[str] = None,
    skip: bool = False,
    isbn_list: Optional[list[str]] = None,
    new_name: Optional[str] = None,
    no_meta: bool = False,
    no_isbn: bool = False,
) -> None:
    """Prints progress messages for the arguments that are set.

    Args:
        old_name (Optional[str]): Name of the file being processed.
        skip (bool): Report that the file is skipped as already processed.
        isbn_list (Optional[list[str]]): ISBNs that will be used.
        new_name (Optional[str]): The new name chosen for the file.
        no_meta (bool): Report that no metadata could be found.
        no_isbn (bool): Report that no ISBN could be found.

    Notes:
        Each truthy argument produces one message, always in the order of the
        parameters. The messages for ``skip``, ``new_name``, ``no_meta`` and
        ``no_isbn`` are each followed by a rule of 90 asterisks.
    """
    rule = "*" * 90

    if old_name:
        print(f"processing: {old_name}")
    if skip:
        print("...skipping previously processed file")
        print(rule)
    if isbn_list:
        print(f"using isbns: {isbn_list}")
    if new_name:
        print(f"new name: {new_name}")
        print(rule)
    if no_meta:
        print("meta information cannot be found")
        print(rule)
    if no_isbn:
        print("isbn ids cannot be found")
        print(rule)




[docs]
def text_block(string: str) -> str:
    """Re-flows a string into lines bounded by the configured line length.

    Args:
        string (str): The text to re-flow.

    Returns:
        str: The words of ``string`` separated by single spaces, with line
        breaks inserted between words. An input without words yields an
        empty string.

    Notes:
        The text is split on whitespace and words are added to the current
        line while its length plus the word plus one separator stays within
        'config.LINE_LENGTH'. Words are never split, so a word longer than the
        limit ends up on a line of its own. The first line is built with a
        leading separator that is stripped at the end, so it can hold one
        character fewer than the following lines.
    """
    lines = []
    pending = ""
    for token in string.split():
        if len(pending) + len(token) + 1 > config.LINE_LENGTH:
            # The word does not fit: close the line and start the next one.
            lines.append(pending)
            pending = token
        else:
            pending = f"{pending} {token}"
    if pending:
        lines.append(pending)
    return "\n".join(lines).lstrip()




[docs]
def find_isbn_in_pdf(pdf_file: Path) -> list[str]:
    """Searches the text of a PDF file for ISBNs.

    Args:
        pdf_file (Path): The Path object representing the PDF file.

    Returns:
        List[str]: The matches from the first page on which a pattern matched,
        or an empty list when nothing was found.

    Note:
        Two regex patterns are tried one after the other; the second is only
        used when the first finds nothing. For each pattern the file is
        opened and its pages are searched in order until a page matches or
        the page index exceeds config.SEARCH_PAGES_ISBN. The limit is tested
        after a page has been searched. Errors raised while reading the file
        are reported with a printed message and count as "nothing found".
    """
    labelled_isbn = re.compile(r"(?i)ISBN(?:-13)?\D*(\d(?:\W*\d){12})", re.M)
    bare_isbn13 = re.compile(
        r"(?:ISBN(?:-13)?:? )?(?=[0-9]{13}$|(?=(?:[0-9]+[- ]){4})[- 0-9]"
        r"{17}$)97[89][- ]?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9]",
        re.M,
    )

    def get_isbn(pat) -> list[str]:  # type: ignore
        found = []
        try:
            with open(pdf_file, "rb") as stream:
                reader = Reader(stream)
                for page_number in range(len(reader.pages)):
                    hits = pat.findall(reader.pages[page_number].extract_text())
                    # Stop on the first page with matches, or once the
                    # configured page limit has been passed.
                    if hits or page_number > config.SEARCH_PAGES_ISBN:
                        found.extend(hits)
                        break
        except (ValueError, TypeError, KeyError, IndexError, PdfReadError):
            print("An error has occurred whilst trying to find the ISBN")
        return found

    isbn_list = []
    for number, pattern in enumerate((labelled_isbn, bare_isbn13), start=1):
        print(f"using pattern {number}")
        isbn_list = get_isbn(pattern)
        if isbn_list:
            return isbn_list

    return isbn_list




[docs]
def publisher_find(book: Path) -> Optional[str]:
    """Finds the publisher of a PDF book.

    Args:
        book (Path): The path to the PDF book.

    Returns:
        Optional[str]: The first known publisher name that was found, or None
        when no publisher could be identified.

    Notes:
        The known publisher names are first looked for in the book's file
        name. If none is present, the text of the book's pages is searched
        with pdfplumber, page by page, until a publisher is found or the page
        index exceeds config.SEARCH_PAGES_PUB (the limit is tested after a
        page has been searched). Pages that yield no text are skipped.

        ValueError, TypeError and KeyError raised during the search are
        caught and reported with a printed message; None is returned in that
        case.
    """
    for publisher in publishers:
        if publisher in book.name:
            return publisher
    try:
        with pdfplumber.open(book) as pdf:
            for count, page in enumerate(pdf.pages):
                # A page without extractable text may yield None; treat it as
                # empty so one blank page does not abort the whole search
                # with a TypeError.
                text = page.extract_text() or ""
                for publisher in publishers:
                    if publisher in text:
                        return publisher
                if count > config.SEARCH_PAGES_PUB:
                    break
    except (ValueError, TypeError, KeyError):
        print("An error has occurred whilst trying to find the publisher")
    return None




[docs]
def fetch_book_metadata(isbn: str) -> dict:
    """Fetches book metadata for an ISBN from the configured book API.

    Args:
        isbn (str): The ISBN (International Standard Book Number) of the book.

    Returns:
        dict: The fetched metadata, or an empty dict when the request fails,
        the response status is not 200, the body is not valid JSON, or the
        response holds no usable result. On success the keys are:
              - "TITLE": Title of the book ("None" if not available).
              - "SUBTITLE": Subtitle of the book (if available, otherwise "None").
              - "DESCRIPTION": Description re-flowed by text_block; only
                present when config.GET_DESCRIPTION is truthy.
              - "AUTHORS": List of authors of the book (empty if not available).
              - "DATE": Publication year of the book (first 4 characters of the
                full date, or "None" if not available).
              - "PUBLISHER": Publisher of the book, with possible mapping applied.
              - "ISBN": The provided ISBN.

    Note:
        The URL template is taken from `book_apis` using config.API. Only the
        first item of the response is used. The "PUBLISHER" field is replaced
        by its `publisher_mapping` entry when one exists.
    """
    meta: dict = {}
    url = book_apis[config.API].format(isbn=isbn)
    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return meta
        data = response.json()
    except (RequestException, ValueError):
        # ValueError covers a body that is not valid JSON; depending on the
        # requests version that error is not a RequestException subclass.
        print("An error occurred whilst getting book metadata")
        return meta

    # Guard against payloads without results instead of failing on a missing key.
    items = data.get("items") if isinstance(data, dict) else None
    if not items:
        return meta
    metadata = items[0].get("volumeInfo")
    if not metadata:
        return meta

    meta["TITLE"] = metadata.get("title", "None")
    meta["SUBTITLE"] = metadata.get("subtitle", "None")
    if config.GET_DESCRIPTION:
        meta["DESCRIPTION"] = text_block(metadata.get("description", "None"))
    meta["AUTHORS"] = metadata.get("authors", [])
    # Only the year is kept; slicing the "None" placeholder leaves it intact.
    meta["DATE"] = metadata.get("publishedDate", "None")[:4]
    publisher = metadata.get("publisher", "None")
    if publisher in publisher_mapping:
        meta["PUBLISHER"] = publisher_mapping[publisher]
    else:
        meta["PUBLISHER"] = publisher
    meta["ISBN"] = isbn
    return meta




[docs]
def main():  # type: ignore
    """Command-line entry point: renames and re-tags the PDF books in a folder.

    The command-line arguments select the folder and may switch on recursion,
    processing of already-renamed files, dry-run mode and the hardcopy log by
    updating ``config``. An existing ``config.HARDCOPY_FILE`` is deleted before
    processing starts.

    For every PDF found, the ISBNs are extracted and sanitized, metadata is
    fetched for the first ISBN, a missing publisher is looked up in the PDF
    itself, and the new name is rendered and normalized. Unless dry-run mode
    is active, the PDF metadata is rewritten and the file is renamed. Files
    whose name starts with "[" are skipped while ``config.SKIP_EXISTING`` is
    set. A KeyboardInterrupt ends processing silently.
    """
    args, _parser = _parse_args(sys.argv[1:])

    target = args.folder[0]
    folder = Path(os.getcwd()) if target == "." else Path(target)

    if args.recurse:
        config.RECURSE = True
    if args.all:
        config.SKIP_EXISTING = False
    if args.dryrun:
        config.DRYRUN = True
    if args.log:
        config.HARDCOPY = True

    print(folder)

    if config.HARDCOPY_FILE.exists():
        config.HARDCOPY_FILE.unlink()

    try:
        books = find_books(folder)
        if not books:
            print("No books found")
            return

        for book in books:
            output(old_name=book.name)
            if config.SKIP_EXISTING and book.name.startswith("["):
                output(skip=True)
                continue

            isbn_numbers = sanitize_isbn(find_isbn_in_pdf(book))
            if not isbn_numbers:
                output(no_isbn=True)
                continue
            output(isbn_list=isbn_numbers)

            # Only the first ISBN found is used for the lookup.
            meta = fetch_book_metadata(isbn_numbers[0])
            if not meta:
                output(no_meta=True)
                continue

            if meta["PUBLISHER"] == "None":
                found_publisher = publisher_find(book)
                if found_publisher is not None:
                    meta["PUBLISHER"] = found_publisher
                else:
                    meta["PUBLISHER"] = "None"

            new_name = normalize_filename(render_template(meta))
            output(new_name=new_name)
            if config.HARDCOPY:
                hardcopy(book.name, isbn_numbers, new_name)
            if config.DRYRUN:
                # Dry run: report only, leave the file untouched.
                continue
            write_metadata(book, new_name)
            update_filename(book, new_name)
    except KeyboardInterrupt:
        pass



if __name__ == "__main__":
    # SystemExit only takes effect when raised; merely constructing it
    # discards main()'s return value and never sets the exit status.
    raise SystemExit(main())