Compare commits

..

No commits in common. "master" and "v0.1.0" have entirely different histories.

4 changed files with 122 additions and 292 deletions

View File

@ -1,24 +0,0 @@
import re
class Chapter:
    """A single book chapter: a title, a body and an ordinal number."""

    def __init__(self, title, text, number=0):
        """Store the chapter data.

        Args:
            title: Plain-text chapter title.
            text: Chapter body. It is inserted into the page unchanged, so it
                is presumably already HTML markup (verify against callers).
            number: Chapter number shown in the heading; defaults to 0.
        """
        self.title = title
        self.text = text
        self.number = number

    def getHtml(self):
        """Return the chapter rendered as a complete HTML document.

        The title is HTML-escaped before it is placed in ``<title>`` and the
        ``<h3>`` heading, because a raw ``&`` or ``<`` in a plain-text title
        would produce a malformed document. The body is inserted verbatim.

        Returns:
            The HTML document as a string.
        """
        # Imported locally so the block does not depend on a file-level import.
        import html

        # quote=False: the title only ever lands in element text, never in an
        # attribute value, so quotes can stay readable.
        safeTitle = html.escape(str(self.title), quote=False)
        chapterHtml = """<html>
<head>
<title>{title}</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body>
<h3>{number} - {title}</h3>
<div class="ebook_chapter_content">
{body}
</div>
</body>
</html>""".format(number=self.number, title=safeTitle, body=self.text)
        return chapterHtml

126
ebook.py
View File

@ -1,126 +0,0 @@
import os
from typing import List
from ebooklib import epub
from chapter import Chapter
class Ebook:
    """Assemble a title, chapters and optional author/artwork into an EPUB.

    The book is built with ebooklib and written to ``<title>.epub`` in the
    current working directory by :meth:`create`.
    """

    def __init__(self, title : str, chapters : List[Chapter], author : str = "",
                 coverArt : str = ""):
        """Remember the book data; nothing is built until ``create()`` runs.

        Args:
            title: Book title; also used for the identifier and the file name.
            chapters: Chapters to include, in reading order.
            author: Optional author name shown on the cover page.
            coverArt: Optional path to a JPEG image used as artwork.
        """
        self.title = title
        self.chapters = chapters
        self.author = author
        self.coverArt = coverArt
        self.ebookChapters = list()
        # Populated by create(); initialised here so that the guards in
        # cover() and addChapters() work when those are called too early.
        self.ebook = None

    def create(self):
        """Build the EPUB and save it as ``<title>.epub``.

        Reads the stylesheet from ``template/style.css`` (relative to the
        working directory), adds the cover and all chapters, and replaces any
        existing file with the same name.
        """
        self.ebook = epub.EpubBook()
        self.ebook.set_identifier("ebook-%s" % self.title)
        self.ebook.set_title(self.title)
        self.ebook.set_language('en')
        if self.author:
            self.ebook.add_author(self.author)

        # Context manager so the stylesheet handle is closed deterministically.
        with open("template/style.css") as styleFile:
            styleSheet = styleFile.read()
        self.docStyle = epub.EpubItem(
            uid="doc_style",
            file_name="style/main.css",
            media_type="text/css",
            content=styleSheet
        )
        self.ebook.add_item(self.docStyle)

        self.cover()
        self.addChapters()

        filename = "{basename}.epub".format(basename=self.title)
        print("Saving to '%s'" % filename)
        if os.path.exists(filename):
            os.remove(filename)
        epub.write_epub(filename, self.ebook, {})

    def cover(self):
        """Add the cover page and, when ``coverArt`` is set, an artwork page.

        Does nothing if ``create()`` has not initialised the book yet.
        """
        if self.ebook is None:
            return

        # Imported locally so the block does not depend on a file-level import.
        from html import escape

        # Title and author are plain text; escape them so characters such as
        # '&' or '<' cannot break the generated XHTML.
        content = "<h1>{title}</h1>".format(title=escape(str(self.title), quote=False))
        if self.author:
            content += "Written by {author}".format(
                author=escape(str(self.author), quote=False))

        self.intro_ch = epub.EpubHtml(title="Cover", file_name='cover.xhtml')
        self.intro_ch.add_item(self.docStyle)
        self.intro_ch.content = """<html>
<head>
<title>Cover</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
{content}
</body>
</html>
""".format(content=content)
        self.ebook.add_item(self.intro_ch)

        if self.coverArt:
            # The artwork is a JPEG: it must be read as bytes, text mode would
            # fail to decode it or corrupt it.
            with open(self.coverArt, "rb") as artFile:
                artData = artFile.read()
            self.book_artwork = epub.EpubItem(
                uid="book_artwork",
                file_name="media/artwork.jpg",
                media_type="image/jpeg",
                content=artData
            )
            self.ebook.add_item(self.book_artwork)

            art_ch = epub.EpubHtml(title="Artwork", file_name='artwork.xhtml')
            art_ch.add_item(self.docStyle)
            art_ch.add_item(self.book_artwork)
            art_ch.content = """<html>
<head>
<title>Artwork</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<img class="intro_artwork" src="media/artwork.jpg" alt="Artwork" />
</body>
</html>
"""
            # NOTE(review): the artwork page is linked from the TOC but is not
            # part of the spine built in tableOfContent() - confirm intended.
            self.ebook.add_item(art_ch)

    def addChapters(self):
        """Convert every chapter into an EPUB page, then build the TOC.

        Does nothing if the book is not initialised or there are no chapters;
        in that case the TOC and spine are not created either.
        """
        if self.ebook is None:
            return
        if not self.chapters:
            return

        for chapterData in self.chapters:
            chapter = epub.EpubHtml(
                title=chapterData.title,
                file_name='Chapter-{number}.xhtml'.format(number=chapterData.number)
            )
            chapter.add_item(self.docStyle)
            chapter.content = chapterData.getHtml()
            self.ebook.add_item(chapter)
            self.ebookChapters.append(chapter)

        self.tableOfContent()

    def tableOfContent(self):
        """Set the table of contents, navigation files and spine of the book."""
        toc = list()
        if self.coverArt:
            toc.append(epub.Link('artwork.xhtml', 'Artwork', 'artwork'))
        if self.chapters:
            toc.append(epub.Link('cover.xhtml', 'Cover', 'cover'))
            toc.append((epub.Section('Chapters'), self.ebookChapters))

        # Set the TOC
        self.ebook.toc = tuple(toc)

        # Add navigation files
        self.ebook.add_item(epub.EpubNcx())
        self.ebook.add_item(epub.EpubNav())

        # Create spine: cover first, then the TOC page, then the chapters
        nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
        nav_page.add_item(self.docStyle)
        self.ebook.add_item(nav_page)
        self.ebook.spine = [self.intro_ch, nav_page] + self.ebookChapters

View File

@ -4,84 +4,149 @@ Script to download webpages, extract text and merge all of them
together to create one ebook. together to create one ebook.
""" """
import argparse import errno
import os
import re
from importlib import import_module from bs4 import BeautifulSoup
from urllib.parse import urlparse from ebooklib import epub
from typing import List from newspaper import Article
from chapter import Chapter # Book name
from ebook import Ebook name = "Name"
# Base url, this is used as url = mainUrl + <number of chapter>
mainUrl = "http://example.com/chapter-"
# Total number of chapters
numberOfChapters = 1
# Start from 0 or 1 ?
fromZero = False
class NovelDownload: # numerical representation of start for script
def __init__(self, url): start = 0 if fromZero else 1
self.url = url
self.title : str = "" re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
self.author : str = ""
self.chapters : List[Chapter] = list() chapters = []
self.parser = None
if not self.loadModule():
print("Url is not supported")
def loadModule(self): def download(number):
url = urlparse(self.url) """Download webpage, extract main article, save result"""
moduleName = url.netloc.replace(".", "_") url = "%s%d" % (mainUrl, number)
# import parser article = Article(url)
try: article.download()
parserPackage = import_module('parsers.' + moduleName) article.parse()
except ImportError:
print("Parser module not found: " + moduleName)
return False
# return getattr(module, name)
self.parser = parserPackage.Parser(self.url) chapterText = "%s - %d" % (name, number)
header = False
return True for match in re_paragraph.finditer(article.text):
paragraph = match.group()
paragraph = paragraph.strip()
def download(self): if paragraph != "Previous ChapterNext Chapter":
if not self.parser: if not header:
print("There was an error") chapterText += "<h2>%s</h2>" % (paragraph)
return header = True
else:
chapterText += "<p>%s</p>\n" % (paragraph)
self.parser.prepare() chapterHtml = BeautifulSoup(
self.author = self.parser.getAuthor() """<html>
self.title = self.parser.getTitle() <head>
<title>...</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body></body>
</html>""",
'lxml'
)
chapterHtml.head.title.string = article.title
chapterHtml.body.append(chapterText)
for x in range(1, len(self.parser.getUrls()), 1): return str(chapterText)
number, title, content = self.parser.nextChapter()
if number == -1:
number = x
print("{number} - {title}".format(number=number, title=title))
chapter = Chapter(title, content, number)
self.chapters.append(chapter)
def save(self):
book = Ebook(self.title, self.chapters, self.author) def packageEbook():
book.create() ebook = epub.EpubBook()
ebook.set_identifier("ebook-%s" % name)
ebook.set_title(name)
ebook.set_language('en')
doc_style = epub.EpubItem(
uid="doc_style",
file_name="style/main.css",
media_type="text/css",
content=open("template/style.css").read()
)
ebook.add_item(doc_style)
intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
intro_ch.add_item(doc_style)
intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
ebook.add_item(intro_ch)
ebookChapters = []
i = start
for chapter_data in chapters:
chapter = epub.EpubHtml(
title="%s - %d" % (name, i),
file_name='%s-%d.xhtml' % (name, i)
)
chapter.add_item(doc_style)
chapter.content = chapter_data
ebook.add_item(chapter)
ebookChapters.append(chapter)
i += 1
# Set the TOC
ebook.toc = (
epub.Link('intro.xhtml', 'Introduction', 'intro'),
(epub.Section('Chapters'), ebookChapters)
)
# add navigation files
ebook.add_item(epub.EpubNcx())
ebook.add_item(epub.EpubNav())
# Create spine
nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
nav_page.add_item(doc_style)
ebook.add_item(nav_page)
ebook.spine = [intro_ch, nav_page] + ebookChapters
filename = '%s.epub' % (name)
print("Saving to '%s'" % filename)
if os.path.exists(filename):
os.remove(filename)
epub.write_epub(filename, ebook, {})
def main(): def main():
parser = argparse.ArgumentParser(description='Webnovel downloader') """Start main downloader and converter"""
parser.add_argument('-u', '--url', metavar='URL', type=str, nargs=1, required=True,
help='Url of the index page or first chapter, depends on parser support.')
parser.add_argument('--version', action='version', version='%(prog)s 1.0.0')
args = parser.parse_args()
nd = NovelDownload(args.url[0]) # Download all chapters one by one
print("Downloading...")
for i in range(start, numberOfChapters + 1, 1):
print("Downloading: ", name, i)
chapters.append(download(i))
# Download all chapters one by one packageEbook()
print("Downloading...")
nd.download() print("Done")
print("Saving...")
nd.save()
print("Done")
# if used standalone, start the script # if used standalone, start the script
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -1,85 +0,0 @@
import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List
class Parser:
    """Scrape a novel from a site with one index page and many chapter pages.

    The index page is expected to contain ``<ul class="version-chap">`` with
    links to the chapters; each chapter page is expected to contain
    ``<div class="reading-content">`` and an ``<ol class="breadcrumb">`` whose
    active item holds the chapter title.
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the download forever.
    requestTimeout = 30

    # "Chapter 12: Title" -> ("12", " Title"); one optional separator character
    # directly after the number is dropped.
    _titlePattern = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        """Remember the index URL; nothing is downloaded until ``prepare()``."""
        self.indexUrl : str = url
        self.chapterUrls : List[str] = list()
        self.indexPage : BeautifulSoup = None
        self.chapterCycle : cycle = None

    def prepare(self):
        """Download the index page and collect the chapter URLs from it."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter in the cycle.

        Returns:
            A ``(number, title, content)`` tuple. ``number`` is the chapter
            number as a string taken from the title, or ``-1`` when the title
            does not start with ``Chapter <n>``. ``content`` is the second
            child node of the reading-content element.

        Raises:
            FileNotFoundError: The chapter page did not return HTTP 200.
            RuntimeError: An expected element is missing from the page.
        """
        url = next(self.chapterCycle)
        page = self._fetchPage(url)

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title
        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        if bcol is None:
            raise RuntimeError('Failed to find a breadcrumb list <ol class="breadcrumb">')
        titleTag = bcol.findChild("li", {"class": "active"}, recursive=True)
        if titleTag is None:
            raise RuntimeError('Failed to find a chapter title <li class="active">')
        number, title = self._splitTitle(titleTag.text)

        # Strip scripts and advertising blocks from the chapter body.
        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        return number, title, content.contents[1]

    def getAuthor(self):
        """Return the author name from the downloaded index page."""
        author = self.indexPage.find("div", {"class": "author-content"})
        return author.text.strip()

    def getTitle(self):
        """Return the novel title from the downloaded index page."""
        title = self.indexPage.find("div", {"class": "post-title"})
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs collected by ``parseIndex()``."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every object in ``objects``."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download the index page and store it parsed in ``self.indexPage``.

        Raises:
            FileNotFoundError: The index page did not return HTTP 200.
        """
        self.indexPage = self._fetchPage(self.indexUrl)

    def parseIndex(self):
        """Collect chapter URLs from the index page, reversed from page order.

        Replaces any previously collected URLs and resets the chapter cycle.

        Raises:
            RuntimeError: The chapter list or its links cannot be found.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        chapters = chapterListTag.findChildren("a", recursive=True)
        # An empty result must be rejected here, otherwise cycling over an
        # empty list would fail later with StopIteration.
        if not chapters:
            raise RuntimeError('Failed to find a links to chapters <a>')

        hrefs = [chapter.get('href') for chapter in chapters]
        hrefs.reverse()
        # Assign in place so that repeated calls do not accumulate duplicates
        # and existing references to the list stay valid.
        self.chapterUrls[:] = hrefs
        self.chapterCycle = cycle(self.chapterUrls)

    def _fetchPage(self, url):
        """Download ``url`` and return it parsed with the lxml parser."""
        response = requests.get(url, timeout=self.requestTimeout)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    @classmethod
    def _splitTitle(cls, titleText):
        """Split a breadcrumb title into ``(number, title)``.

        Returns ``(-1, stripped text)`` when the text does not start with
        ``Chapter <n>``.
        """
        match = cls._titlePattern.match(titleText)
        if match is None:
            return -1, titleText.strip()
        number, title = match.groups()
        return number, title.strip()