Rewrite to a more modular style

- Only one module is available for now: boxnovel.com
- There are still some problems, but it is getting better

Signed-off-by: Juraj Oravec <jurajoravec@mailo.com>
This commit is contained in:
Juraj Oravec 2020-12-21 03:03:32 +01:00
parent 5c1297cc5a
commit d8a8122bdc
Signed by: SGOrava
GPG Key ID: 13660A3F1D9F093B
4 changed files with 292 additions and 122 deletions

24
chapter.py Normal file
View File

@ -0,0 +1,24 @@
import re
class Chapter:
    """One chapter of a book: a title, its body markup and a chapter number."""

    def __init__(self, title, text, number=0):
        """Store the chapter data.

        Args:
            title: Plain-text chapter title.
            text: Chapter body. It is inserted into the page verbatim, so it
                must already be markup (or something whose ``str()`` is).
            number: Chapter number shown in the heading (defaults to 0).
        """
        self.title = title
        self.text = text
        self.number = number

    def getHtml(self):
        """Return the chapter rendered as a standalone HTML page.

        The title is HTML-escaped because it is plain text and may contain
        characters such as ``&`` or ``<`` that would otherwise produce
        malformed markup. The body is inserted unchanged.

        Returns:
            The full page as a string.
        """
        # Local import keeps this block self-contained.
        from html import escape

        safeTitle = escape(str(self.title), quote=False)
        chapterHtml = """<html>
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body>
<h3>{number} - {title}</h3>
<div class="ebook_chapter_content">
{body}
</div>
</body>
</html>""".format(number=self.number, title=safeTitle, body=self.text)
        return chapterHtml

126
ebook.py Normal file
View File

@ -0,0 +1,126 @@
import os
from typing import List
from ebooklib import epub
from chapter import Chapter
class Ebook:
    """Assemble a title, author, optional cover art and chapters into an EPUB."""

    def __init__(self, title : str, chapters : List["Chapter"], author : str = "",
                 coverArt : str = ""):
        """Store the book data; nothing is built until create() is called.

        Args:
            title: Book title; also used for the identifier and output file name.
            chapters: Chapter objects to include, in reading order.
            author: Author name; omitted from the book when empty.
            coverArt: Path to a cover image file; no artwork page when empty.
        """
        self.title = title
        self.chapters = chapters
        self.author = author
        self.coverArt = coverArt
        self.ebookChapters = list()
        # Set by create(). Initialised here so cover() and addChapters() can
        # return early instead of raising AttributeError when called too soon.
        self.ebook = None

    def create(self):
        """Build the EPUB and write it to "<title>.epub" in the working directory.

        Reads the stylesheet from "template/style.css" (relative path) and
        replaces any existing output file of the same name.
        """
        self.ebook = epub.EpubBook()
        # Start from a clean list so a repeated create() does not duplicate chapters.
        self.ebookChapters = list()
        self.ebook.set_identifier("ebook-%s" % self.title)
        self.ebook.set_title(self.title)
        self.ebook.set_language('en')
        if self.author:
            self.ebook.add_author(self.author)

        # Context manager guarantees the stylesheet file handle is closed.
        with open("template/style.css") as styleFile:
            styleContent = styleFile.read()
        self.docStyle = epub.EpubItem(
            uid="doc_style",
            file_name="style/main.css",
            media_type="text/css",
            content=styleContent
        )
        self.ebook.add_item(self.docStyle)

        self.cover()
        self.addChapters()

        filename = "{basename}.epub".format(basename=self.title)
        print("Saving to '%s'" % filename)
        if os.path.exists(filename):
            os.remove(filename)
        epub.write_epub(filename, self.ebook, {})

    def cover(self):
        """Add the cover page and, when cover art is set, an artwork page.

        Does nothing if the book has not been created yet.
        """
        if self.ebook is None:
            return

        # Local import keeps this block self-contained.
        from html import escape

        # Title and author are plain text; escape them before embedding in markup.
        content = "<h1>{title}</h1>".format(title=escape(str(self.title), quote=False))
        if self.author:
            content += "Written by {author}".format(
                author=escape(str(self.author), quote=False))

        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
        self.intro_ch.add_item(self.docStyle)
        self.intro_ch.content = """<html>
<head>
<title>Cover</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
{content}
</body>
</html>
""".format(content=content)
        self.ebook.add_item(self.intro_ch)

        if self.coverArt:
            # The artwork is declared as image/jpeg, so it must be read as bytes;
            # text mode would fail to decode or corrupt the image data.
            with open(self.coverArt, "rb") as artFile:
                artContent = artFile.read()
            self.book_artwork = epub.EpubItem(
                uid="book_artwork",
                file_name="media/artwork.jpg",
                media_type="image/jpeg",
                content=artContent
            )
            self.ebook.add_item(self.book_artwork)

            art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
            art_ch.add_item(self.docStyle)
            art_ch.add_item(self.book_artwork)
            art_ch.content = """<html>
<head>
<title>Artwork</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
</body>
</html>
"""
            self.ebook.add_item(art_ch)

    def addChapters(self):
        """Convert every stored chapter into an EPUB page, then build the TOC.

        Does nothing if the book has not been created yet or there are no
        chapters.
        """
        if self.ebook is None:
            return
        if not self.chapters:
            return

        for chapterData in self.chapters:
            chapter = epub.EpubHtml(
                title=chapterData.title,
                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
            )
            chapter.add_item(self.docStyle)
            chapter.content = chapterData.getHtml()
            self.ebook.add_item(chapter)
            self.ebookChapters.append(chapter)

        self.tableOfContent()

    def tableOfContent(self):
        """Set the table of contents, navigation items and spine of the book."""
        toc = list()
        if self.coverArt:
            toc.append(epub.Link('artwork.xhtml', 'Artwork', 'artwork'))
        if self.chapters:
            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
            toc.append((epub.Section('Chapters'), self.ebookChapters))

        # Set the TOC
        self.ebook.toc = tuple(toc)

        # add navigation files
        self.ebook.add_item(epub.EpubNcx())
        self.ebook.add_item(epub.EpubNav())

        # Create spine
        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
        nav_page.add_item(self.docStyle)
        self.ebook.add_item(nav_page)
        self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters

View File

@ -4,146 +4,81 @@ Script to downloaded webpages, extract text and merge all of them
together to create one ebook. together to create one ebook.
""" """
import errno import argparse
import os
import re
from bs4 import BeautifulSoup from importlib import import_module
from ebooklib import epub from urllib.parse import urlparse
from newspaper import Article from typing import List
# Book name from chapter import Chapter
name = "Name" from ebook import Ebook
# Base url, this is used as url = mainUrl + <number of chapter>
mainUrl = "http://example.com/chapter-"
# Number of all chapter
numberOfChapters = 1
# Start from 0 or 1 ?
fromZero = False
# numerical representation of start for script class NovelDownload:
start = 0 if fromZero else 1 def __init__(self, url):
self.url = url
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE) self.title : str = ""
self.author : str = ""
chapters = [] self.chapters : List[Chapter] = list()
self.parser = None
if not self.loadModule():
print("Url is not supported")
def download(number): def loadModule(self):
"""Download webpage, extract main article, save result""" url = urlparse(self.url)
url = "%s%d" % (mainUrl, number) moduleName = url.netloc.replace(".", "_")
article = Article(url) # import parser
article.download() try:
article.parse() parserPackage = import_module('parsers.' + moduleName)
except ImportError:
print("Parser module not found: " + moduleName)
return False
# return getattr(module, name)
chapterText = "%s - %d" % (name, number) self.parser = parserPackage.Parser(self.url)
header = False
for match in re_paragraph.finditer(article.text): return True
paragraph = match.group()
paragraph = paragraph.strip()
if paragraph != "Previous ChapterNext Chapter": def download(self):
if not header: if not self.parser:
chapterText += "<h2>%s</h2>" % (paragraph) print("There was an error")
header = True return
else:
chapterText += "<p>%s</p>\n" % (paragraph)
chapterHtml = BeautifulSoup( self.parser.prepare()
"""<html> self.author = self.parser.getAuthor()
<head> self.title = self.parser.getTitle()
<title>...</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body></body>
</html>""",
'lxml'
)
chapterHtml.head.title.string = article.title
chapterHtml.body.append(chapterText)
return str(chapterText) for x in range(1, len(self.parser.getUrls()), 1):
number, title, content = self.parser.nextChapter()
if number == -1:
number = x
print("{number} - {title}".format(number=number, title=title))
chapter = Chapter(title, content, number)
self.chapters.append(chapter)
def save(self):
def packageEbook(): book = Ebook(self.title, self.chapters, self.author)
ebook = epub.EpubBook() book.create()
ebook.set_identifier("ebook-%s" % name)
ebook.set_title(name)
ebook.set_language('en')
doc_style = epub.EpubItem(
uid="doc_style",
file_name="style/main.css",
media_type="text/css",
content=open("template/style.css").read()
)
ebook.add_item(doc_style)
intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
intro_ch.add_item(doc_style)
intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
ebook.add_item(intro_ch)
ebookChapters = []
i = start
for chapter_data in chapters:
chapter = epub.EpubHtml(
title="%s - %d" % (name, i),
file_name='%s-%d.xhtml' % (name, i)
)
chapter.add_item(doc_style)
chapter.content = chapter_data
ebook.add_item(chapter)
ebookChapters.append(chapter)
i += 1
# Set the TOC
ebook.toc = (
epub.Link('intro.xhtml', 'Introduction', 'intro'),
(epub.Section('Chapters'), ebookChapters)
)
# add navigation files
ebook.add_item(epub.EpubNcx())
ebook.add_item(epub.EpubNav())
# Create spine
nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
nav_page.add_item(doc_style)
ebook.add_item(nav_page)
ebook.spine = [intro_ch, nav_page] + ebookChapters
filename = '%s.epub' % (name)
print("Saving to '%s'" % filename)
if os.path.exists(filename):
os.remove(filename)
epub.write_epub(filename, ebook, {})
def main(): def main():
"""Start main downloader and converter""" parser = argparse.ArgumentParser(description='Webnovel downloader')
parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
help='Url of the index page or first chapter, depends on parser support.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
args = parser.parse_args()
nd = NovelDownload(args.url[0])
# Download all chapters one by one # Download all chapters one by one
print("Downloading...") print("Downloading...")
for i in range(start, numberOfChapters + 1, 1): nd.download()
print("Downloading: ", name, i) print("Saving...")
chapters.append(download(i)) nd.save()
packageEbook()
print("Done") print("Done")

85
parsers/boxnovel_com.py Normal file
View File

@ -0,0 +1,85 @@
import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List
class Parser:
    """Downloads a novel's index page and its chapters for the boxnovel.com module."""

    # Seconds to wait for the server; without a timeout a stalled request hangs forever.
    REQUEST_TIMEOUT = 30
    # Splits "Chapter <number><optional separator><title>" headings.
    TITLE_PATTERN = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        """Remember the index URL; nothing is downloaded until prepare().

        Args:
            url: Address of the novel's index page.
        """
        self.indexUrl : str = url
        self.chapterUrls : List[str] = list()
        self.indexPage : BeautifulSoup = None
        self.chapterCycle : cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URLs from it."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter in the URL cycle.

        Returns:
            A ``(number, title, content)`` tuple. ``number`` is an int, -1 when
            the heading does not follow the "Chapter N" form; ``content`` is
            the node taken from the chapter container.

        Raises:
            RuntimeError: prepare() was not called, or the expected page
                elements are missing.
            FileNotFoundError: the chapter page could not be downloaded.
        """
        if self.chapterCycle is None:
            raise RuntimeError("prepare() must be called before nextChapter()")

        url = next(self.chapterCycle)
        page = self._fetch(url)

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title
        breadcrumb = page.find("ol", {"class": "breadcrumb"})
        titleTag = None
        if breadcrumb is not None:
            titleTag = breadcrumb.find("li", {"class": "active"})
        if titleTag is None:
            raise RuntimeError('Failed to find a chapter title <ol class="breadcrumb">')
        number, title = self._splitTitle(titleTag.text)

        # Strip scripts and advertising blocks from the chapter body.
        self.decompose(content.find_all("script"))
        self.decompose(content.find_all("ins", {"class": "adsbygoogle"}))
        self.decompose(content.find_all("div", {"data-endpoint": "//trends.revcontent.com"}))

        # NOTE(review): assumes the chapter text is the second child node of
        # the container - confirm against the site's markup.
        return number, title, content.contents[1]

    @classmethod
    def _splitTitle(cls, titleText):
        """Split a heading into ``(number, title)``; number is -1 if not "Chapter N"."""
        titleText = titleText.strip()
        match = cls.TITLE_PATTERN.match(titleText)
        if match is None:
            return -1, titleText
        return int(match.group(1)), match.group(2).strip()

    def getAuthor(self):
        """Return the author from the index page, or "" when it is not present."""
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            return ""
        return author.text.strip()

    def getTitle(self):
        """Return the novel title from the index page.

        Raises:
            RuntimeError: the title element is missing.
        """
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find a novel title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs collected by parseIndex()."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every object in the given iterable."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download the index page and keep its parsed form in ``indexPage``.

        Raises:
            FileNotFoundError: the server did not answer with status 200.
        """
        self.indexPage = self._fetch(self.indexUrl)

    def _fetch(self, url):
        """Download ``url`` and return it parsed; raise FileNotFoundError unless status is 200."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    def parseIndex(self):
        """Collect chapter links from the index page and reset the chapter cycle.

        The links are stored in reverse page order.

        Raises:
            RuntimeError: the chapter list or its links are missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        chapters = chapterListTag.find_all("a", recursive=True)
        # The search yields an empty result rather than None when nothing matches.
        if not chapters:
            raise RuntimeError('Failed to find a links to chapters <a>')

        # Rebuild the list so a repeated call does not duplicate entries, and
        # skip anchors that carry no href.
        urls = [chapter.get('href') for chapter in chapters]
        self.chapterUrls = [url for url in urls if url]
        self.chapterUrls.reverse()

        self.chapterCycle = cycle(self.chapterUrls)