Rewrite to a more modular style

- Only one module is available for now: boxnovel.com
- There are still some problems, but it is getting better

Signed-off-by: Juraj Oravec <jurajoravec@mailo.com>
This commit is contained in:
Juraj Oravec 2020-12-21 03:03:32 +01:00
parent 5c1297cc5a
commit d8a8122bdc
Signed by: SGOrava
GPG Key ID: 13660A3F1D9F093B
4 changed files with 292 additions and 122 deletions

24
chapter.py Normal file
View File

@ -0,0 +1,24 @@
import re
class Chapter:
    """One chapter of a book together with its XHTML page rendering.

    Attributes:
        title: Chapter heading. Treated as plain text and escaped on output.
        text: Chapter body. Inserted into the page verbatim (not escaped).
        number: Chapter number shown in the heading; defaults to 0.
    """

    def __init__(self, title, text, number=0):
        """Store the chapter title, body and number unchanged."""
        self.title = title
        self.text = text
        self.number = number

    def getHtml(self):
        """Return the chapter as a complete HTML document string.

        The title is escaped so that characters such as ``&`` or ``<`` cannot
        break the markup of the generated page. The body is inserted as-is.
        """
        from html import escape

        # quote=False: the title only appears in element content, never in
        # an attribute, so quotes can stay readable.
        safeTitle = escape(str(self.title), quote=False)
        return """<html>
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body>
<h3>{number} - {title}</h3>
<div class="ebook_chapter_content">
{body}
</div>
</body>
</html>""".format(number=self.number, title=safeTitle, body=self.text)

126
ebook.py Normal file
View File

@ -0,0 +1,126 @@
import os
from typing import List
from ebooklib import epub
from chapter import Chapter
class Ebook:
    """Build an EPUB file from a title, a list of chapters and optional extras.

    Typical use is to construct the object and call ``create()``, which
    assembles the book and writes ``<title>.epub`` to the current directory.

    Attributes:
        title: Book title; also used for the identifier and output filename.
        chapters: Chapter objects providing ``title``, ``number`` and
            ``getHtml()``.
        author: Optional author name; skipped when empty.
        coverArt: Optional path to a cover image; skipped when empty.
        ebookChapters: The ``epub.EpubHtml`` pages created for the chapters.
        ebook: The ``epub.EpubBook`` under construction, ``None`` until
            ``create()`` has been called.
    """

    _COVER_PAGE = """<html>
<head>
<title>Cover</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
{content}
</body>
</html>
"""

    _ARTWORK_PAGE = """<html>
<head>
<title>Artwork</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
</body>
</html>
"""

    def __init__(self, title: str, chapters: "List[Chapter]", author: str = "",
                 coverArt: str = ""):
        """Remember the book data; nothing is built until ``create()``."""
        self.title = title
        self.chapters = chapters
        self.author = author
        self.coverArt = coverArt
        self.ebookChapters = list()
        # Set by create()/cover(). Initialised here so the "is the book
        # ready?" guards in the other methods work before create() is called.
        self.ebook = None
        self.docStyle = None
        self.intro_ch = None

    @staticmethod
    def _outputFilename(title):
        """Return the ``.epub`` filename for *title* with path separators replaced."""
        basename = str(title)
        # A separator in the title would otherwise be read as a directory.
        for separator in ("/", "\\"):
            basename = basename.replace(separator, "_")
        return "{basename}.epub".format(basename=basename)

    def create(self):
        """Assemble the whole book and write it to ``<title>.epub``.

        Reads the stylesheet from ``template/style.css`` relative to the
        current working directory. An existing output file is replaced.
        """
        self.ebook = epub.EpubBook()
        # Start from a clean list so calling create() twice does not
        # carry chapter pages over from the previous book.
        self.ebookChapters = list()

        self.ebook.set_identifier("ebook-%s" % self.title)
        self.ebook.set_title(self.title)
        self.ebook.set_language('en')
        if self.author:
            self.ebook.add_author(self.author)

        with open("template/style.css", encoding="utf-8") as styleFile:
            styleSheet = styleFile.read()
        self.docStyle = epub.EpubItem(
            uid="doc_style",
            file_name="style/main.css",
            media_type="text/css",
            content=styleSheet
        )
        self.ebook.add_item(self.docStyle)

        self.cover()
        self.addChapters()

        filename = self._outputFilename(self.title)
        print("Saving to '%s'" % filename)
        if os.path.exists(filename):
            os.remove(filename)
        epub.write_epub(filename, self.ebook, {})

    def cover(self):
        """Add the cover page and, if ``coverArt`` is set, an artwork page.

        Does nothing when the book has not been started with ``create()``.
        """
        if not self.ebook:
            return
        from html import escape

        # Title and author are plain text; escape them for the markup.
        content = "<h1>{title}</h1>".format(
            title=escape(str(self.title), quote=False))
        if self.author:
            content += "Written by {author}".format(
                author=escape(str(self.author), quote=False))

        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
        self.intro_ch.add_item(self.docStyle)
        self.intro_ch.content = self._COVER_PAGE.format(content=content)
        self.ebook.add_item(self.intro_ch)

        if not self.coverArt:
            return

        # Images are binary data: read bytes, not decoded text.
        with open(self.coverArt, "rb") as artFile:
            artwork = artFile.read()
        self.book_artwork = epub.EpubItem(
            uid="book_artwork",
            file_name="media/artwork.jpg",
            media_type="image/jpeg",
            content=artwork
        )
        self.ebook.add_item(self.book_artwork)

        art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
        art_ch.add_item(self.docStyle)
        art_ch.add_item(self.book_artwork)
        art_ch.content = self._ARTWORK_PAGE
        self.ebook.add_item(art_ch)

    def addChapters(self):
        """Create one page per chapter, then build the table of contents.

        Does nothing when the book has not been started or there are no
        chapters.
        """
        if not self.ebook:
            return
        if not self.chapters:
            return

        for chapterData in self.chapters:
            chapter = epub.EpubHtml(
                title=chapterData.title,
                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
            )
            chapter.add_item(self.docStyle)
            chapter.content = chapterData.getHtml()
            self.ebook.add_item(chapter)
            self.ebookChapters.append(chapter)

        self.tableOfContent()

    def tableOfContent(self):
        """Set the TOC, add the navigation items and define the spine.

        Does nothing when the book has not been started with ``create()``.
        """
        if not self.ebook:
            return

        toc = list()
        if self.coverArt:
            toc.append(epub.Link('artwork.xhtml', 'Artwork', 'artwork'))
        if self.chapters:
            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
            toc.append((epub.Section('Chapters'), self.ebookChapters))

        # Set the TOC
        self.ebook.toc = tuple(toc)

        # add navigation files
        self.ebook.add_item(epub.EpubNcx())
        self.ebook.add_item(epub.EpubNav())

        # Create spine
        # NOTE(review): a second EpubNav is added here in addition to the
        # default one above, and the artwork page is linked from the TOC but
        # is not part of the spine - confirm both are intended.
        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
        nav_page.add_item(self.docStyle)
        self.ebook.add_item(nav_page)
        self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters

View File

@ -4,146 +4,81 @@ Script to download webpages, extract text and merge all of them
together to create one ebook.
"""
import errno
import os
import re
import argparse
from bs4 import BeautifulSoup
from ebooklib import epub
from newspaper import Article
from importlib import import_module
from urllib.parse import urlparse
from typing import List
# Book name
name = "Name"
# Base url, this is used as url = mainUrl + <number of chapter>
mainUrl = "http://example.com/chapter-"
# Number of all chapter
numberOfChapters = 1
# Start from 0 or 1 ?
fromZero = False
from chapter import Chapter
from ebook import Ebook
# numerical representation of start for script
start = 0 if fromZero else 1
class NovelDownload:
def __init__(self, url):
self.url = url
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
self.title : str = ""
self.author : str = ""
chapters = []
self.chapters : List[Chapter] = list()
self.parser = None
if not self.loadModule():
print("Url is not supported")
def download(number):
"""Download webpage, extract main article, save result"""
url = "%s%d" % (mainUrl, number)
def loadModule(self):
url = urlparse(self.url)
moduleName = url.netloc.replace(".", "_")
article = Article(url)
article.download()
article.parse()
# import parser
try:
parserPackage = import_module('parsers.' + moduleName)
except ImportError:
print("Parser module not found: " + moduleName)
return False
# return getattr(module, name)
chapterText = "%s - %d" % (name, number)
header = False
self.parser = parserPackage.Parser(self.url)
for match in re_paragraph.finditer(article.text):
paragraph = match.group()
paragraph = paragraph.strip()
return True
if paragraph != "Previous ChapterNext Chapter":
if not header:
chapterText += "<h2>%s</h2>" % (paragraph)
header = True
else:
chapterText += "<p>%s</p>\n" % (paragraph)
def download(self):
if not self.parser:
print("There was an error")
return
chapterHtml = BeautifulSoup(
"""<html>
<head>
<title>...</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body></body>
</html>""",
'lxml'
)
chapterHtml.head.title.string = article.title
chapterHtml.body.append(chapterText)
self.parser.prepare()
self.author = self.parser.getAuthor()
self.title = self.parser.getTitle()
return str(chapterText)
for x in range(1, len(self.parser.getUrls()), 1):
number, title, content = self.parser.nextChapter()
if number == -1:
number = x
print("{number} - {title}".format(number=number, title=title))
chapter = Chapter(title, content, number)
self.chapters.append(chapter)
def packageEbook():
ebook = epub.EpubBook()
ebook.set_identifier("ebook-%s" % name)
ebook.set_title(name)
ebook.set_language('en')
doc_style = epub.EpubItem(
uid="doc_style",
file_name="style/main.css",
media_type="text/css",
content=open("template/style.css").read()
)
ebook.add_item(doc_style)
intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
intro_ch.add_item(doc_style)
intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
ebook.add_item(intro_ch)
ebookChapters = []
i = start
for chapter_data in chapters:
chapter = epub.EpubHtml(
title="%s - %d" % (name, i),
file_name='%s-%d.xhtml' % (name, i)
)
chapter.add_item(doc_style)
chapter.content = chapter_data
ebook.add_item(chapter)
ebookChapters.append(chapter)
i += 1
# Set the TOC
ebook.toc = (
epub.Link('intro.xhtml', 'Introduction', 'intro'),
(epub.Section('Chapters'), ebookChapters)
)
# add navigation files
ebook.add_item(epub.EpubNcx())
ebook.add_item(epub.EpubNav())
# Create spine
nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
nav_page.add_item(doc_style)
ebook.add_item(nav_page)
ebook.spine = [intro_ch, nav_page] + ebookChapters
filename = '%s.epub' % (name)
print("Saving to '%s'" % filename)
if os.path.exists(filename):
os.remove(filename)
epub.write_epub(filename, ebook, {})
def save(self):
book = Ebook(self.title, self.chapters, self.author)
book.create()
def main():
"""Start main downloader and converter"""
parser = argparse.ArgumentParser(description='Webnovel downloader')
parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
help='Url of the index page or first chapter, depends on parser support.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
args = parser.parse_args()
nd = NovelDownload(args.url[0])
# Download all chapters one by one
print("Downloading...")
for i in range(start, numberOfChapters + 1, 1):
print("Downloading: ", name, i)
chapters.append(download(i))
packageEbook()
nd.download()
print("Saving...")
nd.save()
print("Done")

85
parsers/boxnovel_com.py Normal file
View File

@ -0,0 +1,85 @@
import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List
class Parser:
    """Scraper for a boxnovel.com novel: index page plus chapter pages.

    Call ``prepare()`` first; it downloads the index page and collects the
    chapter URLs. After that ``getTitle()``, ``getAuthor()``, ``getUrls()``
    and ``nextChapter()`` can be used.

    Attributes:
        indexUrl: URL of the novel's index page.
        chapterUrls: Chapter URLs in reading order, filled by ``parseIndex()``.
        indexPage: Parsed index page, ``None`` until ``downloadIndex()``.
        chapterCycle: Endless iterator over ``chapterUrls``; it wraps around
            to the first chapter after the last one.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # "Chapter <number><optional separator><title>"
    _TITLE_RE = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        """Remember the index URL; no network access happens here."""
        self.indexUrl : str = url
        self.chapterUrls : List[str] = list()
        self.indexPage : BeautifulSoup = None
        self.chapterCycle : cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URLs from it."""
        self.downloadIndex()
        self.parseIndex()

    def _fetchPage(self, url):
        """Download *url* and return it parsed with BeautifulSoup.

        Raises:
            FileNotFoundError: The server did not answer with status 200.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    @classmethod
    def _splitTitle(cls, titleText):
        """Split a breadcrumb title into ``(number, title)``.

        Returns the number as the string of digits found after "Chapter".
        When the text does not have that form, returns ``-1`` and the whole
        stripped text as the title.
        """
        titleText = titleText.strip()
        match = cls._TITLE_RE.match(titleText)
        if match is None:
            return -1, titleText
        number, title = match.groups()
        return number, title.strip()

    def nextChapter(self):
        """Download the next chapter and return ``(number, title, content)``.

        ``number`` is ``-1`` when it cannot be read from the page title.
        ``content`` is the second child node of the ``reading-content`` div
        with scripts and advertising blocks removed.

        Raises:
            RuntimeError: ``prepare()`` was not called, or the page has no
                ``<div class="reading-content">``.
            FileNotFoundError: The chapter page could not be downloaded.
        """
        if self.chapterCycle is None:
            raise RuntimeError("prepare() must be called before nextChapter()")

        page = self._fetchPage(next(self.chapterCycle))

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title; a page without the breadcrumb still yields its content.
        titleText = ""
        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        if bcol is not None:
            titleTag = bcol.findChild("li", {"class": "active"}, recursive=True)
            if titleTag is not None:
                titleText = titleTag.text
        number, title = self._splitTitle(titleText)

        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        # NOTE(review): relies on the chapter text being the second child
        # node of the container - confirm against the current site markup.
        return number, title, content.contents[1]

    def getAuthor(self):
        """Return the author from the index page, or "" when not present."""
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            return ""
        return author.text.strip()

    def getTitle(self):
        """Return the novel title from the index page.

        Raises:
            RuntimeError: The index page has no ``<div class="post-title">``.
        """
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find a novel title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs collected by ``parseIndex()``."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every object in *objects*."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download and parse the index page into ``indexPage``.

        Raises:
            FileNotFoundError: The index page could not be downloaded.
        """
        self.indexPage = self._fetchPage(self.indexUrl)

    def parseIndex(self):
        """Collect the chapter URLs from the downloaded index page.

        Raises:
            RuntimeError: The chapter list or its links are missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        links = chapterListTag.findChildren("a", recursive=True)
        urls = [link.get('href') for link in links if link.get('href')]
        if not urls:
            raise RuntimeError('Failed to find a links to chapters <a>')

        # Page order is reversed; presumably the site lists the newest
        # chapter first - TODO confirm.
        urls.reverse()

        # Replace rather than extend so a repeated call does not duplicate.
        self.chapterUrls = urls
        self.chapterCycle = cycle(self.chapterUrls)