Source code for retype.controllers.library

import os
import json
import logging
from lxml.html import fromstring, builder, tostring, xhtml_to_html
from ebooklib import epub
from qt import QTextBrowser

from retype.extras.utils import isspaceorempty
from retype.extras.hashing import generate_file_md5

logger = logging.getLogger(__name__)


[docs]class LibraryController(object):
[docs] def __init__(self, user_dir, library_paths): self.user_dir = user_dir self.library_paths = library_paths self._library_items = self.indexLibrary(library_paths) self.books = None self.save_file_contents = None
@property def user_dir(self): return self._user_dir @user_dir.setter def user_dir(self, value): self._user_dir = value self.save_abs_path = os.path.join(value, 'save.json')
[docs] def indexLibrary(self, library_paths): book_checksum_list = [] library_items = {} idn = 0 for library_path in library_paths: for root, dirs, files in os.walk(library_path): for f in files: if f.lower().endswith(".epub"): path = os.path.join(root, f) checksum = generate_file_md5(path) if checksum in book_checksum_list: continue book_checksum_list.append(checksum) library_items[idn] = LibraryItem(idn, path, checksum) idn += 1 return library_items
[docs] def instantiateBooks(self): self.books = {} for idn, item in self._library_items.items(): book = BookWrapper(item, self.load(item)) self.books[idn] = book
[docs] def setBook(self, book_id, book_view, switchView): book_view.maybeSave() if book_id in self.books: book = self.books[book_id] logger.info("Loading book {}: {}".format(book_id, book.title)) else: logging.error("book_id {} cannot be found".format(book_id)) logging.debug("books: {}".format(self.books)) return save_data = book.save_data logger.info("Save data: {}".format(save_data)) book_view.setBook(book, save_data) switchView.emit(2) book_view.display.centreAroundCursor()
[docs] def save(self, book, data): self.addFriendlyName(data, book.path) key = book.checksum save = self.save_file_contents if (save): save[key] = data else: save = self.save_file_contents = {key: data} with open(self.save_abs_path, 'w', encoding='utf-8') as f: json.dump(save, f, indent=2) book.save_data = data
[docs] def migrateV1Save(self, save): book_checksum_list = [] new_save = {} for key in save: checksum = None if key.lower().endswith(".epub"): if (not os.path.exists(key)): logger.warning(f"Save file contains v1 format save data for\ a file that cannot be found: {key}") # Not a checksum; can’t generate it as we cannot find the # file, but setting it anyway in order that we keep the # save entry rather than delete it from the save. This # save data entry cannot be used by retype, but we should # not delete it as that would be unnecessary data loss and # user could correct the issue by fixing the path, # replacing it with a checksum, or moving the file back. checksum = key else: checksum = generate_file_md5(key) self.addFriendlyName(save[key], key) else: # assume it’s a checksum checksum = key if checksum in book_checksum_list: logger.warning(f"Save file contains several entries for the same\ book (checksum {checksum}). The lowest one in the file will be used") else: book_checksum_list.append(checksum) new_save[checksum] = save[key] return new_save
[docs] def addFriendlyName(self, data, path): data['friendly_name'] = os.path.basename(path)
[docs] def loadSaveFile(self): if os.path.exists(self.save_abs_path): logger.info(f'Read save: {self.save_abs_path}') with open(self.save_abs_path, 'r') as f: save = json.load(f) else: logger.debug( f'Save path {self.save_abs_path} not found.\n' 'This is normal if the save file has not been created yet.') save = {} save = self.migrateV1Save(save) self.save_file_contents = save return save
[docs] def load(self, item): save = None if self.save_file_contents is not None: save = self.save_file_contents else: save = self.loadSaveFile() key = item.checksum if save and key in save: return save[key] return None
[docs]class LibraryItem:
[docs] def __init__(self, idn, path, checksum): self.idn = idn self.path = path self.checksum = checksum
[docs]class BookWrapper(object):
[docs] def __init__(self, library_item, save_data=None): self._library_item = library_item self.path = library_item.path self.idn = library_item.idn self.checksum = library_item.checksum self._book = epub.read_epub(self.path, options={'ignore_ncx': True}) self.title = self._book.title self._chapters = None self._images = None self._author = None self._cover = None self._images = [] self.documents = {} self._unparsed_chapters = [] self.save_data = save_data self.dirty = False self.progress = save_data['progress'] if save_data else None self.progress_subscribers = []
def _parseChaptersContent(self, chapters): parsed_chapters = [] self.chapter_lookup = {} for i, chapter in enumerate(chapters): parsed_chapters.append(self.__parseChapterContent(chapter)) self.chapter_lookup[chapter.file_name.split('/')[-1]] = i return parsed_chapters def __parseChapterContent(self, chapter): raw = chapter.content # FIXME: This ugly workaround is the only way I found to make lxml # use the correct encoding when an lxml declaration is absent from # the document. Also note this causes lxml to get rid of html and # body tags for some reason, which may be a problem in future. declaration = """<?xml version="1.0" encoding="utf-8"?>""" tree = fromstring(bytes(declaration, 'utf-8') + raw) # Replace xml svg elements with valid html svg_elements = tree.xpath('//svg') if svg_elements: for svg in svg_elements: if not svg.xpath('//image'): continue image = svg.xpath('//image')[0] attrs = {item[0]: item[1] for item in image.items()} try: href = attrs['xlink:href'] del attrs['xlink:href'] attrs['src'] = href except AttributeError: pass proper_img = builder.IMG(**attrs) svg.getparent().replace(svg, proper_img) xhtml_to_html(tree) html = tostring(tree, method='xml', encoding='unicode') # Get rid of invisible garbage characters html = html.replace('\ufeff', '') links = tree.xpath('//a/@href') image_links = tree.xpath('//img/@src') images = [] for image_link in image_links: for image in self._images: if image_link.lstrip('./') in image.file_name: images.append({'item': image, 'link': image_link, 'raw': image.content}) # We to store the length of the plain text of all chapters for # progress-calculation purposes dummy_display = QTextBrowser() dummy_display.setHtml(html) plain = dummy_display.toPlainText() return {'html': html, 'plain': plain, 'len': len(plain), 'links': links, 'images': images} @property def chapters(self): if not self._unparsed_chapters: self._getItems(self._book) if not self._chapters: self._chapters = self._parseChaptersContent( self._unparsed_chapters) return self._chapters @property def images(self): if not self._images: self._getItems(self._book) return self._images @property def cover(self): if not self._cover: self._getItems(self._book) return self._cover def _getItems(self, book): logger.debug("_getItems called for '{}'".format(book.title)) # Reset lists self._images = [] self._unparsed_chapters = [] # Get items for item in book.get_items(): if type(item) is epub.EpubCover: self._cover = item if type(item) is epub.EpubImage: if 'cover' in item.id and not self._cover: self._cover = item self._images.append(item) if type(item) is epub.EpubHtml: self.documents[item.id] = item # Get chapters for item in book.spine: uid = item[0] if uid in self.documents.keys(): self._unparsed_chapters.append(self.documents[uid]) # Workaround to catch some edge cases where the cover is not marked but # is present on the first page if not self._cover: first_page = self.__parseChapterContent(self._unparsed_chapters[0]) if len(first_page['images']) == 1 and \ isspaceorempty(first_page['plain'], True): self._cover = first_page['images'][0]['item'] @property def author(self): book = self._book if not self._author: for namespace in book.metadata.keys(): data = book.metadata[namespace] for key, value in data.items(): if key == 'creator': self._author = value[0][0] return self._author
[docs] def updateProgress(self, progress): self.progress = progress for subscriber in self.progress_subscribers: subscriber(progress)