Rewrite using Python libraries, use tabs

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
Juraj Oravec 2018-01-03 16:12:09 +01:00
parent 0cb72ef82d
commit c09655294a
No known key found for this signature in database
GPG Key ID: 63ACB65056BC8D07
2 changed files with 120 additions and 58 deletions

View File

@ -1,12 +1,13 @@
* NovelDownloader * NovelDownloader
Script to download each chapter of the webnovel, merge them together and convert to epub Script to download each chapter of the webnovel and save all as epub.
* Dependencies * Dependencies
- ebook-convert (Calibre) - python-beautifulsoup4
- python-ebooklib
- python-newspaper3k - python-newspaper3k
* Usage * Usage
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script.
* Author's comment * Author's comment
Slowly getting rid of external commands. Basic function is done, requires refactoring.

View File

@ -6,18 +6,18 @@ together to create one ebook.
import errno import errno
import os import os
import shutil import re
import subprocess
import sys
from newspaper import Article
from bs4 import BeautifulSoup
from ebooklib import epub
from newspaper import Article
# Book name # Book name
name = "Name" name = "Name"
# Base url, this is used as url = mainUrl + <number of chapter> # Base url, this is used as url = mainUrl + <number of chapter>
mainUrl = "http://example.com/chapter-" mainUrl = "http://example.com/chapter-"
# Number of all chapters # Number of all chapters
chapters = 1 numberOfChapters = 1
# Start from 0 or 1 ? # Start from 0 or 1 ?
fromZero = False fromZero = False
@ -25,67 +25,128 @@ fromZero = False
# numerical representation of start for script # numerical representation of start for script
start = 0 if fromZero else 1 start = 0 if fromZero else 1
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
def runInShell(command): chapters = []
"""Run giver string in shell"""
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
process.wait()
process.communicate()
def download(mainUrl, name, number): def download(number):
"""Download webpage, extract main article, save result""" """Download webpage, extract main article, save result"""
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number)) url = "%s%d" % (mainUrl, number)
url = "%s%d" % (mainUrl, number)
article = Article(url) article = Article(url)
article.download() article.download()
article.parse() article.parse()
# save file chapterText = "%s - %d" % (name, number)
try: header = False
file = open(fileName, "w")
file.write(article.text) for match in re_paragraph.finditer(article.text):
file.close() paragraph = match.group()
except OSError as err: paragraph = paragraph.strip()
print("OS error: {0}".format(err))
except: if paragraph != "Previous ChapterNext Chapter":
print("Unexpected error:", sys.exc_info()[0]) if not header:
raise chapterText += "<h2>%s</h2>" % (paragraph)
header = True
else:
chapterText += "<p>%s</p>\n" % (paragraph)
chapterHtml = BeautifulSoup(
"""<html>
<head>
<title>...</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body></body>
</html>""",
'lxml'
)
chapterHtml.head.title.string = article.title
chapterHtml.body.append(chapterText)
return str(chapterText)
def packageEbook():
	"""Build an EPUB from the module-level ``chapters`` list and save it.

	The book is made of an introduction page, a table-of-contents page and
	one XHTML document per entry of ``chapters``. Chapter numbering begins at
	the module-level ``start``. The stylesheet is read from
	``template/style.css`` and the result is written to ``<name>.epub``
	(a relative path), removing any existing file of that name first.
	"""
	ebook = epub.EpubBook()
	ebook.set_identifier("ebook-%s" % name)
	ebook.set_title(name)
	ebook.set_language('en')

	# Read the stylesheet inside a context manager so the file handle is
	# closed right away instead of being left open until garbage collection.
	with open("template/style.css") as css_file:
		css_content = css_file.read()

	doc_style = epub.EpubItem(
		uid="doc_style",
		file_name="style/main.css",
		media_type="text/css",
		content=css_content
	)
	ebook.add_item(doc_style)

	# The literal below is kept flush-left on purpose: any indentation inside
	# the triple-quoted string would become part of the page content.
	intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
	intro_ch.add_item(doc_style)
	intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
	ebook.add_item(intro_ch)

	# One EpubHtml document per downloaded chapter, numbered from ``start``.
	ebookChapters = []
	for number, chapter_data in enumerate(chapters, start):
		chapter = epub.EpubHtml(
			title="%s - %d" % (name, number),
			file_name='%s-%d.xhtml' % (name, number)
		)
		chapter.add_item(doc_style)
		chapter.content = chapter_data
		ebook.add_item(chapter)
		ebookChapters.append(chapter)

	# Set the TOC
	ebook.toc = (
		epub.Link('intro.xhtml', 'Introduction', 'intro'),
		(epub.Section('Chapters'), ebookChapters)
	)

	# add navigation files
	ebook.add_item(epub.EpubNcx())
	ebook.add_item(epub.EpubNav())

	# Create spine
	# NOTE(review): this adds a second EpubNav item besides the one added
	# just above -- confirm that both navigation documents are intended.
	nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
	nav_page.add_item(doc_style)
	ebook.add_item(nav_page)

	ebook.spine = [intro_ch, nav_page] + ebookChapters

	filename = '%s.epub' % (name)
	print("Saving to '%s'" % filename)
	if os.path.exists(filename):
		os.remove(filename)
	epub.write_epub(filename, ebook, {})
def main(): def main():
"""Start main downloader and converter""" """Start main downloader and converter"""
try:
os.makedirs("chapters")
except OSError as e:
if e.errno != errno.EEXIST:
raise
# Download all chapters one by one # Download all chapters one by one
print("Downloading...") print("Downloading...")
for i in range(start, chapters + 1, 1): for i in range(start, numberOfChapters + 1, 1):
print("Downloading: ", name, i) print("Downloading: ", name, i)
download(mainUrl, name, i) chapters.append(download(i))
# merge all chapter to one file packageEbook()
print("Merging...")
command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name)
runInShell(command)
# convert to epub print("Done")
print("Converting...")
command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
runInShell(command)
# remove download directory
print("Removing temporary data...")
shutil.rmtree('chapters')
os.remove("%s.txt" % (name))
print("Done")
# if used standalone, start the script # if used standalone, start the script
if __name__ == '__main__': if __name__ == '__main__':
main() main()