Rewrite to a more modular style

- Only one module is available for now: boxnovel.com - There are still some problems but it is getting better Signed-off-by: Juraj Oravec <jurajoravec@mailo.com>
2020-12-21 03:03:32 +01:00 · 2020-12-21 03:03:32 +01:00 · d8a8122bdc
commit d8a8122bdc
parent 5c1297cc5a
4 changed files with 292 additions and 122 deletions
--- a/chapter.py
+++ b/chapter.py
@ -0,0 +1,24 @@
+import re
+
+
+class Chapter:
+    """A single chapter: its title, its body markup and its number."""
+
+    def __init__(self, title, text, number=0):
+        """Remember the chapter title, body text and (optional) number."""
+        self.number = number
+        self.title = title
+        self.text = text
+
+    def getHtml(self):
+        """Return the chapter rendered as a standalone HTML page.
+
+        The page links the shared stylesheet "style/main.css", shows a
+        "<number> - <title>" heading and wraps the body text in a
+        "ebook_chapter_content" div.
+        """
+        # Indentation is part of the emitted markup, so it is spelled out
+        # explicitly instead of being hidden inside a multi-line literal.
+        outer = " " * 12
+        inner = " " * 16
+        deepest = " " * 20
+
+        pageLines = [
+            "<html>",
+            outer + "<head>",
+            inner + "<title>{}</title>".format(self.title),
+            inner + '<link rel="stylesheet" type="text/css" href="style/main.css" />',
+            outer + "</head>",
+            outer + "<body>",
+            inner + "<h3>{} - {}</h3>".format(self.number, self.title),
+            inner + '<div class="ebook_chapter_content">',
+            deepest + "{}".format(self.text),
+            inner + "</div>",
+            outer + "</body>",
+            outer + "</html>",
+        ]
+        return "\n".join(pageLines)
--- a/ebook.py
+++ b/ebook.py
@ -0,0 +1,126 @@
+import os
+
+from typing import List
+from ebooklib import epub
+from chapter import Chapter
+
+
+class Ebook:
+    """Build an EPUB file from a title, an author and a list of chapters.
+
+    Call create() to assemble the book (stylesheet, cover page, optional
+    artwork page, chapters, table of contents) and write it to
+    "<title>.epub" in the current working directory.
+    """
+
+    def __init__(self, title : str, chapters : List[Chapter], author : str = "",
+                 coverArt : str = ""):
+        """Store the book metadata; nothing is read or written until create().
+
+        Args:
+            title: Book title, also used for the identifier and output file name.
+            chapters: Chapters in reading order.
+            author: Optional author name; omitted from the book when empty.
+            coverArt: Optional path to a JPEG image shown on an artwork page.
+        """
+        self.title = title
+        self.chapters = chapters
+        self.author = author
+        self.coverArt = coverArt
+
+        # Set by create(). Initialised here so the "if not self.ebook" guards
+        # in cover()/addChapters() return early instead of raising
+        # AttributeError when those methods are called before create().
+        self.ebook = None
+        # Artwork page, present only when coverArt is given.
+        self.artworkChapter = None
+        self.ebookChapters = list()
+
+    def create(self):
+        """Assemble the EPUB and write it to "<title>.epub".
+
+        An existing file with the same name is removed first. The stylesheet
+        is read from "template/style.css" relative to the working directory.
+        """
+        self.ebook = epub.EpubBook()
+        # Start from a clean slate so a repeated create() does not add every
+        # chapter to the spine twice.
+        self.artworkChapter = None
+        self.ebookChapters = list()
+
+        self.ebook.set_identifier("ebook-%s" % self.title)
+        self.ebook.set_title(self.title)
+        self.ebook.set_language('en')
+        if self.author:
+            self.ebook.add_author(self.author)
+
+        # Use a context manager so the file handle is closed deterministically.
+        with open("template/style.css") as styleFile:
+            styleContent = styleFile.read()
+        self.docStyle = epub.EpubItem(
+            uid="doc_style",
+            file_name="style/main.css",
+            media_type="text/css",
+            content=styleContent
+        )
+        self.ebook.add_item(self.docStyle)
+
+        self.cover()
+        self.addChapters()
+
+        filename = "{basename}.epub".format(basename=self.title)
+        print("Saving to '%s'" % filename)
+        if os.path.exists(filename):
+            os.remove(filename)
+        epub.write_epub(filename, self.ebook, {})
+
+    def cover(self):
+        """Add the cover page and, when coverArt is set, an artwork page.
+
+        Does nothing when the book has not been created yet.
+        """
+        if not self.ebook:
+            return
+
+        content = "<h1>{title}</h1>".format(title=self.title)
+        if self.author:
+            content += "Written by {author}".format(author=self.author)
+
+        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
+        self.intro_ch.add_item(self.docStyle)
+        self.intro_ch.content = """<html>
+        <head>
+            <title>Cover</title>
+            <link rel="stylesheet" href="style/main.css" type="text/css" />
+        </head>
+        <body>
+            {content}
+        </body>
+        </html>
+        """.format(content=content)
+        self.ebook.add_item(self.intro_ch)
+
+        if self.coverArt:
+            # Images must be read as bytes; text mode would try to decode the
+            # JPEG data and fail or corrupt it.
+            with open(self.coverArt, "rb") as artFile:
+                artwork = artFile.read()
+            self.book_artwork = epub.EpubItem(
+                uid="book_artwork",
+                file_name="media/artwork.jpg",
+                media_type="image/jpeg",
+                content=artwork
+            )
+            self.ebook.add_item(self.book_artwork)
+
+            art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
+            art_ch.add_item(self.docStyle)
+            art_ch.add_item(self.book_artwork)
+            art_ch.content = """<html>
+            <head>
+                <title>Artwork</title>
+                <link rel="stylesheet" href="style/main.css" type="text/css" />
+            </head>
+            <body>
+                <img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
+            </body>
+            </html>
+            """
+            self.ebook.add_item(art_ch)
+            # Remembered so tableOfContent() can place the page in the spine.
+            self.artworkChapter = art_ch
+
+    def addChapters(self):
+        """Add every chapter to the book, then build the table of contents.
+
+        Does nothing when the book has not been created yet or when there are
+        no chapters.
+        """
+        if not self.ebook:
+            return
+
+        if not self.chapters:
+            return
+
+        for chapterData in self.chapters:
+            chapter = epub.EpubHtml(
+                title=chapterData.title,
+                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
+            )
+            chapter.add_item(self.docStyle)
+            chapter.content = chapterData.getHtml()
+            self.ebook.add_item(chapter)
+            self.ebookChapters.append(chapter)
+
+        self.tableOfContent()
+
+    def tableOfContent(self):
+        """Set the table of contents, navigation files and spine of the book."""
+        toc = list()
+        if self.coverArt:
+            toc.append(epub.Link('artwork.xhtml', 'Artwork', 'artwork'))
+        if self.chapters:
+            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
+        toc.append((epub.Section('Chapters'), self.ebookChapters))
+
+        # Set the TOC
+        self.ebook.toc = tuple(toc)
+        # add navigation files
+        self.ebook.add_item(epub.EpubNcx())
+        self.ebook.add_item(epub.EpubNav())
+
+        # Create spine
+        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
+        nav_page.add_item(self.docStyle)
+        self.ebook.add_item(nav_page)
+        spine = [self.intro_ch, nav_page] + self.ebookChapters
+        # The artwork page is linked from the TOC, so it has to be part of the
+        # reading order as well; it comes first, matching the TOC order.
+        if self.artworkChapter is not None:
+            spine.insert(0, self.artworkChapter)
+        self.ebook.spine = spine
--- a/novelDownloader.py
+++ b/novelDownloader.py
@ -4,149 +4,84 @@ Script to downloaded webpages, extract text and merge all of them
 together to create one ebook.
 """

-import errno
-import os
-import re
+import argparse

-from bs4 import BeautifulSoup
-from ebooklib import epub
-from newspaper import Article
+from importlib import import_module
+from urllib.parse import urlparse
+from typing import List

-# Book name
-name = "Name"
-# Base url, this is used as url = mainUrl + <number of chapter>
-mainUrl = "http://example.com/chapter-"
-# Number of all chapter
-numberOfChapters = 1
-# Start from 0 or 1 ?
-fromZero = False
+from chapter import Chapter
+from ebook import Ebook


-# numerical representation of start for script
-start = 0 if fromZero else 1
+class NovelDownload:
+    def __init__(self, url):
+        self.url = url

-re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
+        self.title : str = ""
+        self.author : str = ""

-chapters = []
+        self.chapters : List[Chapter] = list()
+        self.parser = None

+        if not self.loadModule():
+            print("Url is not supported")

-def download(number):
-	"""Download webpage, extract main article, save result"""
-	url = "%s%d" % (mainUrl, number)
+    def loadModule(self):
+        url = urlparse(self.url)
+        moduleName = url.netloc.replace(".", "_")

-	article = Article(url)
-	article.download()
-	article.parse()
+        # import parser
+        try:
+            parserPackage = import_module('parsers.' + moduleName)
+        except ImportError:
+            print("Parser module not found: " + moduleName)
+            return False
+        # return getattr(module, name)

-	chapterText = "%s - %d" % (name, number)
-	header = False
+        self.parser = parserPackage.Parser(self.url)

-	for match in re_paragraph.finditer(article.text):
-		paragraph = match.group()
-		paragraph = paragraph.strip()
+        return True

-		if paragraph != "Previous ChapterNext Chapter":
-			if not header:
-				chapterText += "<h2>%s</h2>" % (paragraph)
-				header = True
-			else:
-				chapterText += "<p>%s</p>\n" % (paragraph)
+    def download(self):
+        if not self.parser:
+            print("There was an error")
+            return

-	chapterHtml = BeautifulSoup(
-		"""<html>
-		<head>
-				<title>...</title>
-				<link rel="stylesheet" type="text/css" href="style/main.css" />
-		</head>
-		<body></body>
-		</html>""",
-		'lxml'
-	)
-	chapterHtml.head.title.string = article.title
-	chapterHtml.body.append(chapterText)
+        self.parser.prepare()
+        self.author = self.parser.getAuthor()
+        self.title = self.parser.getTitle()

-	return str(chapterText)
+        # range() excludes its stop value, so one is added to visit every
+        # chapter URL; the previous bound stopped one short and silently
+        # dropped the last chapter.
+        for x in range(1, len(self.parser.getUrls()) + 1):
+            number, title, content = self.parser.nextChapter()
+            # The parser reports -1 when it could not read a chapter number
+            # from the page; fall back to the 1-based download position.
+            if number == -1:
+                number = x
+            print("{number} - {title}".format(number=number, title=title))
+            self.chapters.append(Chapter(title, content, number))

-
-def packageEbook():
-	ebook = epub.EpubBook()
-	ebook.set_identifier("ebook-%s" % name)
-	ebook.set_title(name)
-	ebook.set_language('en')
-	doc_style = epub.EpubItem(
-		uid="doc_style",
-		file_name="style/main.css",
-		media_type="text/css",
-		content=open("template/style.css").read()
-	)
-	ebook.add_item(doc_style)
-
-	intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
-	intro_ch.add_item(doc_style)
-	intro_ch.content = """
-	<html>
-	<head>
-			<title>Introduction</title>
-			<link rel="stylesheet" href="style/main.css" type="text/css" />
-	</head>
-	<body>
-			<h1>%s</h1>
-	</body>
-	</html>
-	""" % (name)
-	ebook.add_item(intro_ch)
-
-	ebookChapters = []
-
-	i = start
-	for chapter_data in chapters:
-		chapter = epub.EpubHtml(
-			title="%s - %d" % (name, i),
-			file_name='%s-%d.xhtml' % (name, i)
-		)
-		chapter.add_item(doc_style)
-		chapter.content = chapter_data
-		ebook.add_item(chapter)
-		ebookChapters.append(chapter)
-
-		i += 1
-
-	# Set the TOC
-	ebook.toc = (
-		epub.Link('intro.xhtml', 'Introduction', 'intro'),
-		(epub.Section('Chapters'), ebookChapters)
-	)
-	# add navigation files
-	ebook.add_item(epub.EpubNcx())
-	ebook.add_item(epub.EpubNav())
-
-	# Create spine
-	nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
-	nav_page.add_item(doc_style)
-	ebook.add_item(nav_page)
-	ebook.spine = [intro_ch, nav_page] + ebookChapters
-
-	filename = '%s.epub' % (name)
-	print("Saving to '%s'" % filename)
-	if os.path.exists(filename):
-			os.remove(filename)
-	epub.write_epub(filename, ebook, {})
+    def save(self):
+        book = Ebook(self.title, self.chapters, self.author)
+        book.create()


 def main():
-	"""Start main downloader and converter"""
+    parser = argparse.ArgumentParser(description='Webnovel downloader')
+    parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
+                        help='Url of the index page or first chapter, depends on parser support.')
+    parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
+    args = parser.parse_args()

-	# Download all chapters one by one
-	print("Downloading...")
-	for i in range(start, numberOfChapters + 1, 1):
-		print("Downloading: ", name, i)
-		chapters.append(download(i))
+    nd = NovelDownload(args.url[0])

-	packageEbook()
-
-	print("Done")
+    # Download all chapters one by one
+    print("Downloading...")
+    nd.download()
+    print("Saving...")
+    nd.save()
+    print("Done")


 # if yused standalone start the script
 if __name__ == '__main__':
-	main()
+    main()
--- a/parsers/boxnovel_com.py
+++ b/parsers/boxnovel_com.py
@ -0,0 +1,85 @@
+import re
+import requests
+from bs4 import BeautifulSoup
+from itertools import cycle
+from typing import List
+
+
+class Parser:
+    """Parser for novels hosted on boxnovel.com.
+
+    Typical use: prepare() downloads and parses the index page, getTitle()
+    and getAuthor() read the book metadata, and nextChapter() is called once
+    per entry of getUrls() to fetch the chapters in order.
+    """
+
+    def __init__(self, url : str):
+        """Remember the index page URL; nothing is downloaded until prepare()."""
+        self.indexUrl : str = url
+
+        self.chapterUrls : List[str] = list()
+        self.indexPage : BeautifulSoup = None
+        self.chapterCycle : cycle = None
+
+    def prepare(self):
+        """Download the index page and collect the chapter URLs from it."""
+        self.downloadIndex()
+        self.parseIndex()
+
+    def nextChapter(self):
+        """Download the next chapter and return (number, title, content).
+
+        Returns:
+            number: Chapter number as found in the page title (a string of
+                digits), or -1 when the title carries no chapter number.
+            title: Chapter title with surrounding whitespace removed.
+            content: The second child node of the "reading-content" div,
+                with scripts and advertisement containers removed.
+
+        Raises:
+            FileNotFoundError: The chapter page could not be downloaded.
+            RuntimeError: The chapter content or its title is missing.
+        """
+        url = next(self.chapterCycle)
+        page = self._fetch(url)
+
+        # Extract chapter
+        content = page.find("div", {"class": "reading-content"})
+        if content is None:
+            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')
+
+        # Get title
+        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
+        titleTag = None
+        if bcol is not None:
+            titleTag = bcol.findChild("li", {"class": "active"}, recursive=True)
+        if titleTag is None:
+            raise RuntimeError('Failed to find a chapter title <li class="active">')
+        number, title = self._parseTitle(titleTag.text)
+
+        self.decompose(content.findChildren("script", recursive=True))
+        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
+        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))
+
+        return number, title, content.contents[1]
+
+    def getAuthor(self):
+        """Return the author from the index page, or "" when it is not listed."""
+        author = self.indexPage.find("div", {"class": "author-content"})
+        if author is None:
+            return ""
+        return author.text.strip()
+
+    def getTitle(self):
+        """Return the novel title from the index page.
+
+        Raises:
+            RuntimeError: The index page has no <div class="post-title">.
+        """
+        title = self.indexPage.find("div", {"class": "post-title"})
+        if title is None:
+            raise RuntimeError('Failed to find a novel title <div class="post-title">')
+        return title.text.strip()
+
+    def getUrls(self):
+        """Return the chapter URLs collected by parseIndex(), in reading order."""
+        return self.chapterUrls
+
+    def decompose(self, objects):
+        """Remove every given tag from the document it belongs to."""
+        for obj in objects:
+            obj.decompose()
+
+    def downloadIndex(self):
+        """Download and parse the index page into self.indexPage.
+
+        Raises:
+            FileNotFoundError: The index page could not be downloaded.
+        """
+        self.indexPage = self._fetch(self.indexUrl)
+
+    def parseIndex(self):
+        """Collect the chapter URLs from the downloaded index page.
+
+        Raises:
+            RuntimeError: The chapter list or the chapter links are missing.
+        """
+        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
+        if chapterListTag is None:
+            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')
+
+        # findChildren() returns an empty list (never None) when nothing
+        # matches, so test for emptiness.
+        chapters = chapterListTag.findChildren("a", recursive=True)
+        if not chapters:
+            raise RuntimeError('Failed to find a links to chapters <a>')
+
+        # Rebuild the list so a repeated prepare() does not duplicate entries.
+        # NOTE(review): the order is reversed, presumably because the index
+        # lists the newest chapter first - confirm against the site.
+        self.chapterUrls = [chapter.get('href') for chapter in chapters]
+        self.chapterUrls.reverse()
+        self.chapterCycle = cycle(self.chapterUrls)
+
+    @staticmethod
+    def _fetch(url):
+        """Download url and return it parsed; raise FileNotFoundError on a non-200 reply."""
+        response = requests.get(url)
+        if response.status_code != 200:
+            raise FileNotFoundError("Unable to download {url}".format(url=url))
+        return BeautifulSoup(response.text, "lxml")
+
+    @staticmethod
+    def _parseTitle(titleText):
+        """Split a "Chapter <n><sep><title>" heading into (number, title).
+
+        Returns (-1, stripped text) when the text does not start with
+        "Chapter" followed by a number.
+        """
+        # Strip first: the pattern is anchored at the start of the text, so
+        # leading whitespace would otherwise prevent a match.
+        titleText = titleText.strip()
+        found = re.findall(r'^Chapter\s?([0-9]+)[^\s]?(.*)', titleText)
+        if not found:
+            return -1, titleText
+        number, title = found[0]
+        return number, title.strip()