Rewrite to a more modular style
- Only one module is available for now: boxnovel.com - There are still some problems but it is getting better Signed-off-by: Juraj Oravec <jurajoravec@mailo.com>
This commit is contained in:
parent
5c1297cc5a
commit
d8a8122bdc
24
chapter.py
Normal file
24
chapter.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class Chapter:
    """A single chapter of a novel that can render itself as an HTML page.

    Attributes:
        title: Chapter title as plain text.
        text: Chapter body. It is inserted into the page verbatim (not
            escaped), so it is presumably already HTML markup.
        number: Chapter number shown in the heading (defaults to 0).
    """

    def __init__(self, title, text, number=0):
        self.title = title
        self.text = text
        self.number = number

    def getHtml(self):
        """Return the chapter as a complete HTML document string.

        The title and number are HTML-escaped because they are plain text;
        a title such as "Cat & Mouse" would otherwise produce markup that is
        not well-formed. The body is inserted unchanged.
        """
        # Function-scope import keeps the change local to this block.
        from html import escape

        safeTitle = escape(str(self.title))
        safeNumber = escape(str(self.number))

        chapterHtml = """<html>
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body>
<h3>{number} - {title}</h3>
<div class="ebook_chapter_content">
{body}
</div>
</body>
</html>""".format(number=safeNumber, title=safeTitle, body=self.text)

        return chapterHtml
|
126
ebook.py
Normal file
126
ebook.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
from ebooklib import epub
|
||||||
|
from chapter import Chapter
|
||||||
|
|
||||||
|
|
||||||
|
class Ebook:
    """Assemble a list of chapters into an EPUB file using ebooklib.

    Typical use is ``Ebook(title, chapters, author).create()``, which builds
    the book and writes ``<title>.epub`` into the current working directory.

    Attributes:
        title: Book title; also used for the identifier and output file name.
        chapters: Chapter objects providing ``title``, ``number`` and
            ``getHtml()``.
        author: Optional author name; omitted from the book when empty.
        coverArt: Optional path to a cover image; it is stored in the book as
            ``media/artwork.jpg`` with media type ``image/jpeg``.
        ebookChapters: The ``epub.EpubHtml`` items created for the chapters.
        ebook: The ``epub.EpubBook`` under construction, or None before
            ``create()`` has been called.
    """

    def __init__(self, title : str, chapters : List[Chapter], author : str = "",
                 coverArt : str = ""):
        self.title = title
        self.chapters = chapters
        self.author = author
        self.coverArt = coverArt

        self.ebookChapters = list()

        # Set by create()/cover(). Defined up front so that the guards in
        # cover() and addChapters() work instead of raising AttributeError.
        self.ebook = None
        self.docStyle = None
        self.intro_ch = None

    def create(self):
        """Build the EPUB and write it to ``<title>.epub``.

        Reads the stylesheet from ``template/style.css`` (relative to the
        current working directory) and replaces an existing output file.
        """
        self.ebook = epub.EpubBook()
        # Start from a clean list so that a second create() call does not
        # carry over chapter items from the previous book.
        self.ebookChapters = list()

        self.ebook.set_identifier("ebook-%s" % self.title)
        self.ebook.set_title(self.title)
        self.ebook.set_language('en')
        if self.author:
            self.ebook.add_author(self.author)

        # Context manager closes the file handle deterministically.
        with open("template/style.css", encoding="utf-8") as styleFile:
            styleContent = styleFile.read()

        self.docStyle = epub.EpubItem(
            uid="doc_style",
            file_name="style/main.css",
            media_type="text/css",
            content=styleContent
        )
        self.ebook.add_item(self.docStyle)

        self.cover()
        self.addChapters()

        filename = "{basename}.epub".format(basename=self.title)
        print("Saving to '%s'" % filename)
        if os.path.exists(filename):
            os.remove(filename)
        epub.write_epub(filename, self.ebook, {})

    def cover(self):
        """Add the cover page and, if ``coverArt`` is set, the artwork page.

        Does nothing when no book has been created yet.
        """
        if self.ebook is None:
            return

        # Function-scope import keeps the change local to this block.
        from html import escape

        # Title and author are plain text; escape them before embedding.
        content = "<h1>{title}</h1>".format(title=escape(str(self.title)))
        if self.author:
            content += "Written by {author}".format(author=escape(str(self.author)))

        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
        self.intro_ch.add_item(self.docStyle)
        self.intro_ch.content = """<html>
<head>
<title>Cover</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
{content}
</body>
</html>
""".format(content=content)
        self.ebook.add_item(self.intro_ch)

        if self.coverArt:
            # Images are binary data: read bytes, not text.
            with open(self.coverArt, "rb") as artFile:
                artContent = artFile.read()

            self.book_artwork = epub.EpubItem(
                uid="book_artwork",
                file_name="media/artwork.jpg",
                media_type="image/jpeg",
                content=artContent
            )
            self.ebook.add_item(self.book_artwork)

            art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
            art_ch.add_item(self.docStyle)
            art_ch.add_item(self.book_artwork)
            art_ch.content = """<html>
<head>
<title>Artwork</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
</body>
</html>
"""
            self.ebook.add_item(art_ch)

    def addChapters(self):
        """Add one XHTML item per chapter, then build the table of contents.

        Does nothing when no book has been created or there are no chapters;
        in that case the table of contents and spine are not set either.
        """
        if self.ebook is None:
            return

        if not self.chapters:
            return

        for chapterData in self.chapters:
            chapter = epub.EpubHtml(
                title=chapterData.title,
                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
            )
            chapter.add_item(self.docStyle)
            chapter.content = chapterData.getHtml()
            self.ebook.add_item(chapter)
            self.ebookChapters.append(chapter)

        self.tableOfContent()

    def tableOfContent(self):
        """Set the book's TOC, navigation files and spine."""
        if self.ebook is None:
            return

        toc = list()
        if self.coverArt:
            toc.append(epub.Link('artwork.xhtml', 'Artwork', 'artwork'))
        if self.chapters:
            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
            toc.append((epub.Section('Chapters'), self.ebookChapters))

        # Set the TOC
        self.ebook.toc = tuple(toc)
        # Add navigation files
        self.ebook.add_item(epub.EpubNcx())
        self.ebook.add_item(epub.EpubNav())

        # Create spine
        # NOTE(review): the artwork page is linked from the TOC but is not
        # part of the spine - confirm whether that is intended.
        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
        nav_page.add_item(self.docStyle)
        self.ebook.add_item(nav_page)
        self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters
|
@ -4,149 +4,84 @@ Script to downloaded webpages, extract text and merge all of them
|
|||||||
together to create one ebook.
|
together to create one ebook.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import errno
|
import argparse
|
||||||
import os
|
|
||||||
import re
|
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from importlib import import_module
|
||||||
from ebooklib import epub
|
from urllib.parse import urlparse
|
||||||
from newspaper import Article
|
from typing import List
|
||||||
|
|
||||||
# Book name
|
from chapter import Chapter
|
||||||
name = "Name"
|
from ebook import Ebook
|
||||||
# Base url, this is used as url = mainUrl + <number of chapter>
|
|
||||||
mainUrl = "http://example.com/chapter-"
|
|
||||||
# Number of all chapter
|
|
||||||
numberOfChapters = 1
|
|
||||||
# Start from 0 or 1 ?
|
|
||||||
fromZero = False
|
|
||||||
|
|
||||||
|
|
||||||
# numerical representation of start for script
|
class NovelDownload:
    """Download a web novel through a site-specific parser and save it as an ebook.

    The parser is looked up from the URL's network location: dots are
    replaced by underscores and the module ``parsers.<name>`` is imported
    (e.g. ``boxnovel.com`` -> ``parsers.boxnovel_com``). The module must
    expose a ``Parser`` class taking the URL.

    Attributes:
        url: The URL passed by the caller.
        title: Novel title, filled in by ``download()``.
        author: Novel author, filled in by ``download()``.
        chapters: Downloaded ``Chapter`` objects, in download order.
        parser: Parser instance, or None when no parser module matches the URL.
    """

    def __init__(self, url):
        self.url = url

        self.title : str = ""
        self.author : str = ""

        self.chapters : List[Chapter] = list()
        self.parser = None

        if not self.loadModule():
            print("Url is not supported")

    def loadModule(self):
        """Import the parser module matching ``self.url`` and instantiate it.

        Returns:
            True when a parser was loaded into ``self.parser``, False when no
            matching module could be imported.
        """
        url = urlparse(self.url)
        moduleName = url.netloc.replace(".", "_")

        # Import the parser module
        try:
            parserPackage = import_module('parsers.' + moduleName)
        except ImportError:
            print("Parser module not found: " + moduleName)
            return False

        self.parser = parserPackage.Parser(self.url)

        return True

    def download(self):
        """Fetch title, author and every chapter through the parser.

        Prints an error and returns without doing anything when no parser
        was loaded. Each downloaded chapter is appended to ``self.chapters``.
        """
        if not self.parser:
            print("There was an error")
            return

        self.parser.prepare()
        self.author = self.parser.getAuthor()
        self.title = self.parser.getTitle()

        # range() excludes its stop value, so "+ 1" is required to also fetch
        # the last chapter (and to fetch anything at all for a single chapter).
        for x in range(1, len(self.parser.getUrls()) + 1):
            number, title, content = self.parser.nextChapter()
            # The parser reports -1 when it could not read a chapter number;
            # fall back to the position in the chapter list.
            if number == -1:
                number = x
            print("{number} - {title}".format(number=number, title=title))
            chapter = Chapter(title, content, number)
            self.chapters.append(chapter)

    def save(self):
        """Write the downloaded chapters to an EPUB via ``Ebook.create()``."""
        book = Ebook(self.title, self.chapters, self.author)
        book.create()
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Parse the command line, download the novel and save it as an ebook.

    Exits with status 1 when no parser supports the given URL, instead of
    going on to write an empty ebook.
    """
    parser = argparse.ArgumentParser(description='Webnovel downloader')
    parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
                        help='Url of the index page or first chapter, depends on parser support.')
    parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
    args = parser.parse_args()

    # nargs=1 makes args.url a one-element list
    nd = NovelDownload(args.url[0])
    if nd.parser is None:
        # NovelDownload has already reported the unsupported URL.
        raise SystemExit(1)

    # Download all chapters one by one
    print("Downloading...")
    nd.download()
    print("Saving...")
    nd.save()
    print("Done")
|
||||||
|
|
||||||
|
|
||||||
# If used standalone, start the script
|
# If used standalone, start the script
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
85
parsers/boxnovel_com.py
Normal file
85
parsers/boxnovel_com.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
import re
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from itertools import cycle
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
class Parser:
    """Parser for novels hosted on boxnovel.com.

    Call ``prepare()`` first; it downloads the index page and collects the
    chapter URLs. ``nextChapter()`` then returns one chapter per call.

    Attributes:
        indexUrl: URL of the novel's index page.
        chapterUrls: Chapter URLs in reverse order of their appearance on
            the index page.
        indexPage: Parsed index page, or None before ``downloadIndex()``.
        chapterCycle: Endless iterator over ``chapterUrls``; it wraps around
            to the first URL after the last one, so callers must count the
            chapters themselves (see ``getUrls()``).
    """

    # "Chapter <number><optional separator><title>"
    _TITLE_RE = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        self.indexUrl : str = url

        self.chapterUrls : List[str] = list()
        self.indexPage : BeautifulSoup = None
        self.chapterCycle : cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URLs."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter.

        Returns:
            A tuple ``(number, title, content)``. ``number`` is the chapter
            number as a string taken from the page, or the integer -1 when
            the title does not start with "Chapter <digits>". ``content`` is
            a node of the parsed page.

        Raises:
            RuntimeError: if ``prepare()`` was not called, or the page has
                no ``<div class="reading-content">``.
            FileNotFoundError: if the chapter page does not return HTTP 200.
        """
        if self.chapterCycle is None:
            raise RuntimeError('prepare() must be called before nextChapter()')
        url = next(self.chapterCycle)

        response = requests.get(url)
        if response.status_code != 200:
            # Report the chapter URL that actually failed, not the index URL.
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        page = BeautifulSoup(response.text, "lxml")

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title from the active breadcrumb entry. A missing breadcrumb
        # is not fatal: the chapter is then returned untitled with number -1.
        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        titleTag = None
        if bcol is not None:
            titleTag = bcol.findChild("li", {"class": "active"}, recursive=True)
        # Strip first so that surrounding whitespace does not defeat the
        # "^Chapter" anchor.
        titleText = titleTag.text.strip() if titleTag is not None else ""

        match = self._TITLE_RE.match(titleText)
        if match:
            number, title = match.groups()
        else:
            number = -1
            title = titleText

        # Remove scripts and advertisement containers from the chapter
        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        # NOTE(review): the second child is presumably the element holding
        # the chapter text - confirm against the site markup. Fall back to
        # the whole container when there are fewer than two children.
        children = content.contents
        body = children[1] if len(children) > 1 else content

        return number, title.strip(), body

    def getAuthor(self):
        """Return the author from the index page, or "" when it is not listed."""
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            return ""
        return author.text.strip()

    def getTitle(self):
        """Return the novel title from the index page.

        Raises:
            RuntimeError: if the page has no ``<div class="post-title">``.
        """
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find a novel title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs collected by ``parseIndex()``."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every object in ``objects``."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download and parse the index page into ``self.indexPage``.

        Raises:
            FileNotFoundError: if the index page does not return HTTP 200.
        """
        response = requests.get(self.indexUrl)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))

        self.indexPage = BeautifulSoup(response.text, "lxml")

    def parseIndex(self):
        """Collect chapter URLs from the index page and reset the chapter iterator.

        Raises:
            RuntimeError: if the chapter list or its links cannot be found.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        chapters = chapterListTag.findChildren("a", recursive=True)
        # Skip anchors without an href; they cannot be downloaded.
        links = [href for href in (chapter.get('href') for chapter in chapters) if href]
        # An empty result (never None) is what signals "no links found".
        if not links:
            raise RuntimeError('Failed to find a links to chapters <a>')

        links.reverse()
        # Replace rather than append so that calling parseIndex() again
        # does not duplicate the URLs.
        self.chapterUrls[:] = links
        self.chapterCycle = cycle(self.chapterUrls)
|
Loading…
Reference in New Issue
Block a user