NovelDownloader/novelDownloader.py
Juraj Oravec c09655294a
Rewrite using Python libraries, use tabs
Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
2018-01-03 16:12:09 +01:00

153 lines
3.2 KiB
Python
Executable File

#!/usr/bin/python3
"""
Script to download webpages, extract their text and merge all of them
together to create one ebook.
"""
import errno
import html
import os
import re

from bs4 import BeautifulSoup
from ebooklib import epub
from newspaper import Article
# Title of the book; also used for chapter labels and the output file name
name = "Name"
# Chapter pages are fetched from mainUrl + <chapter number>
mainUrl = "http://example.com/chapter-"
# Number of the last chapter to download
numberOfChapters = 1
# Whether chapter numbering on the site begins at 0 (True) or at 1 (False)
fromZero = False
# Number of the first chapter to download, derived from fromZero
start = int(not fromZero)
# Matches one paragraph: text closed by a blank line, or text up to a line end
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", flags=re.MULTILINE)
# Chapter HTML fragments gathered by main(), in download order
chapters = list()
def download(number):
    """Download one chapter page and return its text as an HTML fragment.

    The page URL is ``mainUrl`` followed by ``number``. The article text
    extracted from that page is split into paragraphs with ``re_paragraph``;
    the first kept paragraph becomes an ``<h2>`` heading and every later one
    a ``<p>`` element.

    Args:
        number: Chapter number appended to ``mainUrl``.

    Returns:
        A string starting with the "<name> - <number>" label, followed by
        the heading and the paragraphs.

    Raises:
        Errors from the ``Article`` download/parse calls are not caught here
        and propagate to the caller.
    """
    url = "%s%d" % (mainUrl, number)
    article = Article(url)
    article.download()
    article.parse()

    # Collect the pieces in a list and join once at the end.
    parts = ["%s - %d" % (name, number)]
    header_written = False
    for match in re_paragraph.finditer(article.text):
        paragraph = match.group().strip()
        # Skip whitespace-only matches (they would yield empty elements) and
        # the site's navigation line.
        if not paragraph or paragraph == "Previous ChapterNext Chapter":
            continue
        # The text comes from an arbitrary web page: escape &, < and > so it
        # cannot break or inject markup into the chapter document.
        safe_text = html.escape(paragraph, quote=False)
        if header_written:
            parts.append("<p>%s</p>\n" % safe_text)
        else:
            parts.append("<h2>%s</h2>" % safe_text)
            header_written = True
    return "".join(parts)
def packageEbook():
    """Build an EPUB from the collected chapters and write it to disk.

    Reads the module-level ``name``, ``start`` and ``chapters`` values.
    The book contains an introduction page, a navigation page and one page
    per entry of ``chapters``, numbered consecutively from ``start``. The
    stylesheet is loaded from ``template/style.css`` (relative to the
    current working directory) and the result is written to
    ``<name>.epub``, replacing any existing file of that name.

    Raises:
        OSError: If the stylesheet cannot be read or an existing output
            file cannot be removed.
    """
    ebook = epub.EpubBook()
    ebook.set_identifier("ebook-%s" % name)
    ebook.set_title(name)
    ebook.set_language('en')

    # Read the stylesheet inside a context manager so the file handle is
    # closed deterministically.
    with open("template/style.css") as style_file:
        style_content = style_file.read()
    doc_style = epub.EpubItem(
        uid="doc_style",
        file_name="style/main.css",
        media_type="text/css",
        content=style_content
    )
    ebook.add_item(doc_style)

    intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
    intro_ch.add_item(doc_style)
    intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
    ebook.add_item(intro_ch)

    # One EpubHtml page per downloaded chapter, numbered from ``start``.
    ebookChapters = []
    for number, chapter_data in enumerate(chapters, start):
        chapter = epub.EpubHtml(
            title="%s - %d" % (name, number),
            file_name='%s-%d.xhtml' % (name, number)
        )
        chapter.add_item(doc_style)
        chapter.content = chapter_data
        ebook.add_item(chapter)
        ebookChapters.append(chapter)

    # Set the TOC
    ebook.toc = (
        epub.Link('intro.xhtml', 'Introduction', 'intro'),
        (epub.Section('Chapters'), ebookChapters)
    )

    # Add navigation files. Only one navigation document is registered: the
    # styled one that the spine references. Adding a default EpubNav as well
    # would give the package two navigation documents.
    ebook.add_item(epub.EpubNcx())
    nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
    nav_page.add_item(doc_style)
    ebook.add_item(nav_page)

    # Create spine
    ebook.spine = [intro_ch, nav_page] + ebookChapters

    filename = '%s.epub' % (name)
    print("Saving to '%s'" % filename)
    # Remove a stale output file; a missing file is the normal case and is
    # ignored, which avoids the check-then-remove race.
    try:
        os.remove(filename)
    except FileNotFoundError:
        pass
    epub.write_epub(filename, ebook, {})
def main():
    """Fetch every configured chapter, then package them into the ebook."""
    print("Downloading...")
    # Walk the chapter numbers from ``start`` up to ``numberOfChapters``
    # inclusive, storing each downloaded chapter in ``chapters``.
    chapter_number = start
    while chapter_number <= numberOfChapters:
        print("Downloading: ", name, chapter_number)
        chapter_html = download(chapter_number)
        chapters.append(chapter_html)
        chapter_number += 1
    packageEbook()
    print("Done")
# If used standalone (run as a script rather than imported), start the script
if __name__ == '__main__':
    main()