4 changed files with 122 additions and 292 deletions
--- a/chapter.py
+++ b/chapter.py
@ -1,24 +0,0 @@
-import re
-
-
-class Chapter:
-    def __init__(self, title, text, number=0):
-        self.title = title
-        self.text = text
-        self.number = number
-
-    def getHtml(self):
-        chapterHtml = """<html>
-            <head>
-                <title>{title}</title>
-                <link rel="stylesheet" type="text/css" href="style/main.css" />
-            </head>
-            <body>
-                <h3>{number} - {title}</h3>
-                <div class="ebook_chapter_content">
-                    {body}
-                </div>
-            </body>
-            </html>""".format(number=self.number, title=self.title, body=self.text)
-
-        return chapterHtml
--- a/ebook.py
+++ b/ebook.py
@ -1,126 +0,0 @@
-import os
-
-from typing import List
-from ebooklib import epub
-from chapter import Chapter
-
-
-class Ebook:
-    def __init__(self, title : str, chapters : List[Chapter], author : str = "",
-                 coverArt : str = ""):
-        self.title = title
-        self.chapters = chapters
-        self.author = author
-        self.coverArt = coverArt
-
-        self.ebookChapters = list()
-
-    def create(self):
-        self.ebook = epub.EpubBook()
-        self.ebook.set_identifier("ebook-%s" % self.title)
-        self.ebook.set_title(self.title)
-        self.ebook.set_language('en')
-        if self.author:
-            self.ebook.add_author(self.author)
-
-        self.docStyle = epub.EpubItem(
-            uid="doc_style",
-            file_name="style/main.css",
-            media_type="text/css",
-            content=open("template/style.css").read()
-        )
-        self.ebook.add_item(self.docStyle)
-
-        self.cover()
-        self.addChapters()
-
-        filename = "{basename}.epub".format(basename=self.title)
-        print("Saving to '%s'" % filename)
-        if os.path.exists(filename):
-            os.remove(filename)
-        epub.write_epub(filename, self.ebook, {})
-
-    def cover(self):
-        if not self.ebook:
-            return
-
-        content = "<h1>{title}</h1>".format(title=self.title)
-        if self.author:
-            content += "Written by {author}".format(author=self.author)
-
-        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
-        self.intro_ch.add_item(self.docStyle)
-        self.intro_ch.content = """<html>
-        <head>
-            <title>Cover</title>
-            <link rel="stylesheet" href="style/main.css" type="text/css" />
-        </head>
-        <body>
-            {content}
-        </body>
-        </html>
-        """.format(content=content)
-        self.ebook.add_item(self.intro_ch)
-
-        if self.coverArt:
-            self.book_artwork = epub.EpubItem(
-                uid="book_artwork",
-                file_name="media/artwork.jpg",
-                media_type="image/jpeg",
-                content=open(self.coverArt).read()
-            )
-            self.ebook.add_item(self.book_artwork)
-
-            art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
-            art_ch.add_item(self.docStyle)
-            art_ch.add_item(self.book_artwork)
-            art_ch.content = """<html>
-            <head>
-                <title>Artwork</title>
-                <link rel="stylesheet" href="style/main.css" type="text/css" />
-            </head>
-            <body>
-                <img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
-            </body>
-            </html>
-            """
-            self.ebook.add_item(art_ch)
-
-    def addChapters(self):
-        if not self.ebook:
-            return
-
-        if not self.chapters:
-            return
-
-        for chapterData in self.chapters:
-            chapter = epub.EpubHtml(
-                title=chapterData.title,
-                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
-            )
-            chapter.add_item(self.docStyle)
-            chapter.content = chapterData.getHtml()
-            self.ebook.add_item(chapter)
-            self.ebookChapters.append(chapter)
-
-        self.tableOfContent()
-
-    def tableOfContent(self):
-        toc = list()
-        if self.coverArt:
-            toc.append(epub.Link('artwork.xhtml', 'Artvork', 'artwork'))
-        if self.chapters:
-            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
-        toc.append((epub.Section('Chapters'), self.ebookChapters))
-
-        # Set the TOC
-        self.ebook.toc = tuple(toc)
-        # add navigation files
-        self.ebook.add_item(epub.EpubNcx())
-        self.ebook.add_item(epub.EpubNav())
-
-        # Create spine
-        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
-        nav_page.add_item(self.docStyle)
-        self.ebook.add_item(nav_page)
-        self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters
--- a/novelDownloader.py
+++ b/novelDownloader.py
@ -4,84 +4,149 @@ Script to downloaded webpages, extract text and merge all of them
 together to create one ebook.
 """

-import argparse
+import errno
+import os
+import re

-from importlib import import_module
-from urllib.parse import urlparse
-from typing import List
+from bs4 import BeautifulSoup
+from ebooklib import epub
+from newspaper import Article

-from chapter import Chapter
-from ebook import Ebook
+# Book name
+name = "Name"
+# Base URL; each chapter URL is built as mainUrl + <chapter number>
+mainUrl = "http://example.com/chapter-"
+# Number of the last chapter to download (inclusive)
+numberOfChapters = 1
+# True if chapter numbering starts at 0, False if it starts at 1
+fromZero = False


-class NovelDownload:
-    def __init__(self, url):
-        self.url = url
+# Number of the first chapter to download, derived from fromZero
+start = 0 if fromZero else 1

-        self.title : str = ""
-        self.author : str = ""
+re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)

-        self.chapters : List[Chapter] = list()
-        self.parser = None
+chapters = []

-        if not self.loadModule():
-            print("Url is not supported")

-    def loadModule(self):
-        url = urlparse(self.url)
-        moduleName = url.netloc.replace(".", "_")
+def download(number):
+	"""Download a chapter page, extract the main article and return it as HTML markup"""
+	url = "%s%d" % (mainUrl, number)

-        # import parser
-        try:
-            parserPackage = import_module('parsers.' + moduleName)
-        except ImportError:
-            print("Parser module not found: " + moduleName)
-            return False
-        # return getattr(module, name)
+	article = Article(url)
+	article.download()
+	article.parse()

-        self.parser = parserPackage.Parser(self.url)
+	chapterText = "%s - %d" % (name, number)
+	header = False

-        return True
+	for match in re_paragraph.finditer(article.text):
+		paragraph = match.group()
+		paragraph = paragraph.strip()

-    def download(self):
-        if not self.parser:
-            print("There was an error")
-            return
+		if paragraph != "Previous ChapterNext Chapter":
+			if not header:
+				chapterText += "<h2>%s</h2>" % (paragraph)
+				header = True
+			else:
+				chapterText += "<p>%s</p>\n" % (paragraph)

-        self.parser.prepare()
-        self.author = self.parser.getAuthor()
-        self.title = self.parser.getTitle()
+	chapterHtml = BeautifulSoup(
+		"""<html>
+		<head>
+				<title>...</title>
+				<link rel="stylesheet" type="text/css" href="style/main.css" />
+		</head>
+		<body></body>
+		</html>""",
+		'lxml'
+	)
+	chapterHtml.head.title.string = article.title
+	chapterHtml.body.append(chapterText)

-        for x in range(1, len(self.parser.getUrls()), 1):
-            number, title, content = self.parser.nextChapter()
-            if number == -1:
-                number = x
-            print("{number} - {title}".format(number=number, title=title))
-            chapter = Chapter(title, content, number)
-            self.chapters.append(chapter)
+	return str(chapterText)

-    def save(self):
-        book = Ebook(self.title, self.chapters, self.author)
-        book.create()
+
+def packageEbook():
+	ebook = epub.EpubBook()
+	ebook.set_identifier("ebook-%s" % name)
+	ebook.set_title(name)
+	ebook.set_language('en')
+	doc_style = epub.EpubItem(
+		uid="doc_style",
+		file_name="style/main.css",
+		media_type="text/css",
+		content=open("template/style.css").read()
+	)
+	ebook.add_item(doc_style)
+
+	intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
+	intro_ch.add_item(doc_style)
+	intro_ch.content = """
+	<html>
+	<head>
+			<title>Introduction</title>
+			<link rel="stylesheet" href="style/main.css" type="text/css" />
+	</head>
+	<body>
+			<h1>%s</h1>
+	</body>
+	</html>
+	""" % (name)
+	ebook.add_item(intro_ch)
+
+	ebookChapters = []
+
+	i = start
+	for chapter_data in chapters:
+		chapter = epub.EpubHtml(
+			title="%s - %d" % (name, i),
+			file_name='%s-%d.xhtml' % (name, i)
+		)
+		chapter.add_item(doc_style)
+		chapter.content = chapter_data
+		ebook.add_item(chapter)
+		ebookChapters.append(chapter)
+
+		i += 1
+
+	# Set the TOC
+	ebook.toc = (
+		epub.Link('intro.xhtml', 'Introduction', 'intro'),
+		(epub.Section('Chapters'), ebookChapters)
+	)
+	# add navigation files
+	ebook.add_item(epub.EpubNcx())
+	ebook.add_item(epub.EpubNav())
+
+	# Create spine
+	nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
+	nav_page.add_item(doc_style)
+	ebook.add_item(nav_page)
+	ebook.spine = [intro_ch, nav_page] + ebookChapters
+
+	filename = '%s.epub' % (name)
+	print("Saving to '%s'" % filename)
+	if os.path.exists(filename):
+			os.remove(filename)
+	epub.write_epub(filename, ebook, {})


 def main():
-    parser = argparse.ArgumentParser(description='Webnovel downloader')
-    parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
-                        help='Url of the index page or first chapter, depends on parser support.')
-    parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
-    args = parser.parse_args()
+	"""Start main downloader and converter"""

-    nd = NovelDownload(args.url[0])
+	# Download all chapters one by one
+	print("Downloading...")
+	for i in range(start, numberOfChapters + 1, 1):
+		print("Downloading: ", name, i)
+		chapters.append(download(i))

-    # Download all chapters one by one
-    print("Downloading...")
-    nd.download()
-    print("Saving...")
-    nd.save()
-    print("Done")
+	packageEbook()
+
+	print("Done")


 # if yused standalone start the script
 if __name__ == '__main__':
-    main()
+	main()
--- a/parsers/boxnovel_com.py
+++ b/parsers/boxnovel_com.py
@ -1,85 +0,0 @@
-import re
-import requests
-from bs4 import BeautifulSoup
-from itertools import cycle
-from typing import List
-
-
-class Parser:
-    def __init__(self, url : str):
-        self.indexUrl : str = url
-
-        self.chapterUrls : List[str] = list()
-        self.indexPage : BeautifulSoup = None
-        self.chapterCycle : cycle = None
-
-    def prepare(self):
-        self.downloadIndex()
-        self.parseIndex()
-
-    def nextChapter(self):
-        url = next(self.chapterCycle)
-
-        response = requests.get(url)
-        if response.status_code != 200:
-            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))
-        page = BeautifulSoup(response.text, "lxml")
-
-        # Extract chapter
-        content = page.find("div", {"class": "reading-content"})
-        if content is None:
-            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')
-
-        # Get title
-        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
-        titleText = bcol.findChild("li", {"class": "active"}, recursive=True)
-        titleText = titleText.text
-
-        try:
-            number, title = re.findall('^Chapter\s?([0-9]+)[^\s]?(.*)', titleText)[0]
-            title = title.strip()
-        except Exception:
-            number = -1
-            title = titleText
-
-        self.decompose(content.findChildren("script", recursive=True))
-        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
-        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))
-
-        return number, title.strip(), content.contents[1]
-
-    def getAuthor(self):
-        author = self.indexPage.find("div", {"class": "author-content"})
-        return author.text.strip()
-
-    def getTitle(self):
-        title = self.indexPage.find("div", {"class": "post-title"})
-        return title.text.strip()
-
-    def getUrls(self):
-        return self.chapterUrls
-
-    def decompose(self, objects):
-        for obj in objects:
-            obj.decompose()
-
-    def downloadIndex(self):
-        response = requests.get(self.indexUrl)
-        if response.status_code != 200:
-            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))
-
-        self.indexPage = BeautifulSoup(response.text, "lxml")
-
-    def parseIndex(self):
-        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
-        if chapterListTag is None:
-            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')
-
-        chapters = chapterListTag.findChildren("a", recursive=True)
-        if chapters is None:
-            raise RuntimeError('Failed to find a links to chapters <a>')
-
-        for chapter in chapters:
-            self.chapterUrls.append(chapter.get('href'))
-        self.chapterUrls.reverse()
-        self.chapterCycle = cycle(self.chapterUrls)