From d8a8122bdc247c96e4169733090a7177bbed2689 Mon Sep 17 00:00:00 2001 From: Juraj Oravec Date: Mon, 21 Dec 2020 03:03:32 +0100 Subject: [PATCH] Rewrite to a more modular style - Only one module is available for now: boxnovel.com - There are still some problems but it is getting better Signed-off-by: Juraj Oravec --- chapter.py | 24 ++++++ ebook.py | 126 ++++++++++++++++++++++++++++ novelDownloader.py | 179 +++++++++++++--------------------------- parsers/boxnovel_com.py | 85 +++++++++++++++++++ 4 files changed, 292 insertions(+), 122 deletions(-) create mode 100644 chapter.py create mode 100644 ebook.py create mode 100644 parsers/boxnovel_com.py diff --git a/chapter.py b/chapter.py new file mode 100644 index 0000000..d691089 --- /dev/null +++ b/chapter.py @@ -0,0 +1,24 @@ +import re + + +class Chapter: + def __init__(self, title, text, number=0): + self.title = title + self.text = text + self.number = number + + def getHtml(self): + chapterHtml = """ + + {title} + + + +

{number} - {title}

+
+ {body} +
+ + """.format(number=self.number, title=self.title, body=self.text) + + return chapterHtml diff --git a/ebook.py b/ebook.py new file mode 100644 index 0000000..ec9dbd1 --- /dev/null +++ b/ebook.py @@ -0,0 +1,126 @@ +import os + +from typing import List +from ebooklib import epub +from chapter import Chapter + + +class Ebook: + def __init__(self, title : str, chapters : List[Chapter], author : str = "", + coverArt : str = ""): + self.title = title + self.chapters = chapters + self.author = author + self.coverArt = coverArt + + self.ebookChapters = list() + + def create(self): + self.ebook = epub.EpubBook() + self.ebook.set_identifier("ebook-%s" % self.title) + self.ebook.set_title(self.title) + self.ebook.set_language('en') + if self.author: + self.ebook.add_author(self.author) + + self.docStyle = epub.EpubItem( + uid="doc_style", + file_name="style/main.css", + media_type="text/css", + content=open("template/style.css").read() + ) + self.ebook.add_item(self.docStyle) + + self.cover() + self.addChapters() + + filename = "{basename}.epub".format(basename=self.title) + print("Saving to '%s'" % filename) + if os.path.exists(filename): + os.remove(filename) + epub.write_epub(filename, self.ebook, {}) + + def cover(self): + if not self.ebook: + return + + content = "

{title}

".format(title=self.title) + if self.author: + content += "Written by {author}".format(author=self.author) + + self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml') + self.intro_ch.add_item(self.docStyle) + self.intro_ch.content = """ + + Cover + + + + {content} + + + """.format(content=content) + self.ebook.add_item(self.intro_ch) + + if self.coverArt: + self.book_artwork = epub.EpubItem( + uid="book_artwork", + file_name="media/artwork.jpg", + media_type="image/jpeg", + content=open(self.coverArt).read() + ) + self.ebook.add_item(self.book_artwork) + + art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml') + art_ch.add_item(self.docStyle) + art_ch.add_item(self.book_artwork) + art_ch.content = """ + + Artwork + + + + Artwork + + + """ + self.ebook.add_item(art_ch) + + def addChapters(self): + if not self.ebook: + return + + if not self.chapters: + return + + for chapterData in self.chapters: + chapter = epub.EpubHtml( + title=chapterData.title, + file_name='Chapter-{number}.xhtml'.format(number=chapterData.number) + ) + chapter.add_item(self.docStyle) + chapter.content = chapterData.getHtml() + self.ebook.add_item(chapter) + self.ebookChapters.append(chapter) + + self.tableOfContent() + + def tableOfContent(self): + toc = list() + if self.coverArt: + toc.append(epub.Link('artwork.xhtml', 'Artvork', 'artwork')) + if self.chapters: + toc.append(epub.Link('cover.xhtml', 'Cover', 'cover')) + toc.append((epub.Section('Chapters'), self.ebookChapters)) + + # Set the TOC + self.ebook.toc = tuple(toc) + # add navigation files + self.ebook.add_item(epub.EpubNcx()) + self.ebook.add_item(epub.EpubNav()) + + # Create spine + nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml') + nav_page.add_item(self.docStyle) + self.ebook.add_item(nav_page) + self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters diff --git a/novelDownloader.py b/novelDownloader.py index 3b37780..84a9873 100755 --- a/novelDownloader.py +++ 
b/novelDownloader.py @@ -4,149 +4,84 @@ Script to downloaded webpages, extract text and merge all of them together to create one ebook. """ -import errno -import os -import re +import argparse -from bs4 import BeautifulSoup -from ebooklib import epub -from newspaper import Article +from importlib import import_module +from urllib.parse import urlparse +from typing import List -# Book name -name = "Name" -# Base url, this is used as url = mainUrl + -mainUrl = "http://example.com/chapter-" -# Number of all chapter -numberOfChapters = 1 -# Start from 0 or 1 ? -fromZero = False +from chapter import Chapter +from ebook import Ebook -# numerical representation of start for script -start = 0 if fromZero else 1 +class NovelDownload: + def __init__(self, url): + self.url = url -re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE) + self.title : str = "" + self.author : str = "" -chapters = [] + self.chapters : List[Chapter] = list() + self.parser = None + if not self.loadModule(): + print("Url is not supported") -def download(number): - """Download webpage, extract main article, save result""" - url = "%s%d" % (mainUrl, number) + def loadModule(self): + url = urlparse(self.url) + moduleName = url.netloc.replace(".", "_") - article = Article(url) - article.download() - article.parse() + # import parser + try: + parserPackage = import_module('parsers.' + moduleName) + except ImportError: + print("Parser module not found: " + moduleName) + return False + # return getattr(module, name) - chapterText = "%s - %d" % (name, number) - header = False + self.parser = parserPackage.Parser(self.url) - for match in re_paragraph.finditer(article.text): - paragraph = match.group() - paragraph = paragraph.strip() + return True - if paragraph != "Previous ChapterNext Chapter": - if not header: - chapterText += "

%s

" % (paragraph) - header = True - else: - chapterText += "

%s

\n" % (paragraph) + def download(self): + if not self.parser: + print("There was an error") + return - chapterHtml = BeautifulSoup( - """ - - ... - - - - """, - 'lxml' - ) - chapterHtml.head.title.string = article.title - chapterHtml.body.append(chapterText) + self.parser.prepare() + self.author = self.parser.getAuthor() + self.title = self.parser.getTitle() - return str(chapterText) + for x in range(1, len(self.parser.getUrls()), 1): + number, title, content = self.parser.nextChapter() + if number == -1: + number = x + print("{number} - {title}".format(number=number, title=title)) + chapter = Chapter(title, content, number) + self.chapters.append(chapter) - -def packageEbook(): - ebook = epub.EpubBook() - ebook.set_identifier("ebook-%s" % name) - ebook.set_title(name) - ebook.set_language('en') - doc_style = epub.EpubItem( - uid="doc_style", - file_name="style/main.css", - media_type="text/css", - content=open("template/style.css").read() - ) - ebook.add_item(doc_style) - - intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml') - intro_ch.add_item(doc_style) - intro_ch.content = """ - - - Introduction - - - -

%s

- - - """ % (name) - ebook.add_item(intro_ch) - - ebookChapters = [] - - i = start - for chapter_data in chapters: - chapter = epub.EpubHtml( - title="%s - %d" % (name, i), - file_name='%s-%d.xhtml' % (name, i) - ) - chapter.add_item(doc_style) - chapter.content = chapter_data - ebook.add_item(chapter) - ebookChapters.append(chapter) - - i += 1 - - # Set the TOC - ebook.toc = ( - epub.Link('intro.xhtml', 'Introduction', 'intro'), - (epub.Section('Chapters'), ebookChapters) - ) - # add navigation files - ebook.add_item(epub.EpubNcx()) - ebook.add_item(epub.EpubNav()) - - # Create spine - nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml') - nav_page.add_item(doc_style) - ebook.add_item(nav_page) - ebook.spine = [intro_ch, nav_page] + ebookChapters - - filename = '%s.epub' % (name) - print("Saving to '%s'" % filename) - if os.path.exists(filename): - os.remove(filename) - epub.write_epub(filename, ebook, {}) + def save(self): + book = Ebook(self.title, self.chapters, self.author) + book.create() def main(): - """Start main downloader and converter""" + parser = argparse.ArgumentParser(description='Webnovel downloader') + parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True, + help='Url of the index page or first chapter, depends on parser support.') + parser.add_argument('--version', action='version', version='%(prog)s 1.0.0') + args = parser.parse_args() - # Download all chapters one by one - print("Downloading...") - for i in range(start, numberOfChapters + 1, 1): - print("Downloading: ", name, i) - chapters.append(download(i)) + nd = NovelDownload(args.url[0]) - packageEbook() - - print("Done") + # Download all chapters one by one + print("Downloading...") + nd.download() + print("Saving...") + nd.save() + print("Done") # if yused standalone start the script if __name__ == '__main__': - main() + main() diff --git a/parsers/boxnovel_com.py b/parsers/boxnovel_com.py new file mode 100644 index 0000000..47f237c --- /dev/null 
+++ b/parsers/boxnovel_com.py @@ -0,0 +1,85 @@ +import re +import requests +from bs4 import BeautifulSoup +from itertools import cycle +from typing import List + + +class Parser: + def __init__(self, url : str): + self.indexUrl : str = url + + self.chapterUrls : List[str] = list() + self.indexPage : BeautifulSoup = None + self.chapterCycle : cycle = None + + def prepare(self): + self.downloadIndex() + self.parseIndex() + + def nextChapter(self): + url = next(self.chapterCycle) + + response = requests.get(url) + if response.status_code != 200: + raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl)) + page = BeautifulSoup(response.text, "lxml") + + # Extract chapter + content = page.find("div", {"class": "reading-content"}) + if content is None: + raise RuntimeError('Failed to find a chapter content
') + + # Get title + bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True) + titleText = bcol.findChild("li", {"class": "active"}, recursive=True) + titleText = titleText.text + + try: + number, title = re.findall('^Chapter\s?([0-9]+)[^\s]?(.*)', titleText)[0] + title = title.strip() + except Exception: + number = -1 + title = titleText + + self.decompose(content.findChildren("script", recursive=True)) + self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True)) + self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True)) + + return number, title.strip(), content.contents[1] + + def getAuthor(self): + author = self.indexPage.find("div", {"class": "author-content"}) + return author.text.strip() + + def getTitle(self): + title = self.indexPage.find("div", {"class": "post-title"}) + return title.text.strip() + + def getUrls(self): + return self.chapterUrls + + def decompose(self, objects): + for obj in objects: + obj.decompose() + + def downloadIndex(self): + response = requests.get(self.indexUrl) + if response.status_code != 200: + raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl)) + + self.indexPage = BeautifulSoup(response.text, "lxml") + + def parseIndex(self): + chapterListTag = self.indexPage.find("ul", {"class": "version-chap"}) + if chapterListTag is None: + raise RuntimeError('Failed to find a chapter list