Rewrtie using python libraries, use tabs

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
Juraj Oravec 2018-01-03 16:12:09 +01:00
parent 0cb72ef82d
commit c09655294a
No known key found for this signature in database
GPG Key ID: 63ACB65056BC8D07
2 changed files with 120 additions and 58 deletions

View File

@ -1,12 +1,13 @@
* NovelDownloader
Script to download each chapter of the webnovel, merge them together and convert to epub
Script to download each chapter of the webnovel and save all as epub.
* Dependencies
- ebook-convert (Calibre)
- python-beautifulsoup4
- python-ebooklib
- python-newspaper3k
* Usage
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script.
* Author`s comment
Slowly getting rid of external commands.
Basic function is done, requires refactoring.

View File

@ -6,18 +6,18 @@ together to create one ebook.
import errno
import os
import shutil
import subprocess
import sys
from newspaper import Article
import re
from bs4 import BeautifulSoup
from ebooklib import epub
from newspaper import Article
# Book name
name = "Name"
# Base url, this is used as url = mainUrl + <number of chapter>
mainUrl = "http://example.com/chapter-"
# Number of all chapter
chapters = 1
numberOfChapters = 1
# Start from 0 or 1 ?
fromZero = False
@ -25,63 +25,124 @@ fromZero = False
# numerical representation of start for script
start = 0 if fromZero else 1
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
def runInShell(command):
"""Run giver string in shell"""
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
process.wait()
process.communicate()
chapters = []
def download(mainUrl, name, number):
def download(number):
"""Download webpage, extract main article, save result"""
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
url = "%s%d" % (mainUrl, number)
article = Article(url)
article.download()
article.parse()
# save file
try:
file = open(fileName, "w")
file.write(article.text)
file.close()
except OSError as err:
print("OS error: {0}".format(err))
except:
print("Unexpected error:", sys.exc_info()[0])
raise
chapterText = "%s - %d" % (name, number)
header = False
for match in re_paragraph.finditer(article.text):
paragraph = match.group()
paragraph = paragraph.strip()
if paragraph != "Previous ChapterNext Chapter":
if not header:
chapterText += "<h2>%s</h2>" % (paragraph)
header = True
else:
chapterText += "<p>%s</p>\n" % (paragraph)
chapterHtml = BeautifulSoup(
"""<html>
<head>
<title>...</title>
<link rel="stylesheet" type="text/css" href="style/main.css" />
</head>
<body></body>
</html>""",
'lxml'
)
chapterHtml.head.title.string = article.title
chapterHtml.body.append(chapterText)
return str(chapterText)
def packageEbook():
ebook = epub.EpubBook()
ebook.set_identifier("ebook-%s" % name)
ebook.set_title(name)
ebook.set_language('en')
doc_style = epub.EpubItem(
uid="doc_style",
file_name="style/main.css",
media_type="text/css",
content=open("template/style.css").read()
)
ebook.add_item(doc_style)
intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
intro_ch.add_item(doc_style)
intro_ch.content = """
<html>
<head>
<title>Introduction</title>
<link rel="stylesheet" href="style/main.css" type="text/css" />
</head>
<body>
<h1>%s</h1>
</body>
</html>
""" % (name)
ebook.add_item(intro_ch)
ebookChapters = []
i = start
for chapter_data in chapters:
chapter = epub.EpubHtml(
title="%s - %d" % (name, i),
file_name='%s-%d.xhtml' % (name, i)
)
chapter.add_item(doc_style)
chapter.content = chapter_data
ebook.add_item(chapter)
ebookChapters.append(chapter)
i += 1
# Set the TOC
ebook.toc = (
epub.Link('intro.xhtml', 'Introduction', 'intro'),
(epub.Section('Chapters'), ebookChapters)
)
# add navigation files
ebook.add_item(epub.EpubNcx())
ebook.add_item(epub.EpubNav())
# Create spine
nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
nav_page.add_item(doc_style)
ebook.add_item(nav_page)
ebook.spine = [intro_ch, nav_page] + ebookChapters
filename = '%s.epub' % (name)
print("Saving to '%s'" % filename)
if os.path.exists(filename):
os.remove(filename)
epub.write_epub(filename, ebook, {})
def main():
"""Start main downloader and converter"""
try:
os.makedirs("chapters")
except OSError as e:
if e.errno != errno.EEXIST:
raise
# Download all chapters one by one
print("Downloading...")
for i in range(start, chapters + 1, 1):
for i in range(start, numberOfChapters + 1, 1):
print("Downloading: ", name, i)
download(mainUrl, name, i)
chapters.append(download(i))
# merge all chapter to one file
print("Merging...")
command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name)
runInShell(command)
# convert to epub
print("Converting...")
command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
runInShell(command)
# remove download directory
print("Removing temporary data...")
shutil.rmtree('chapters')
os.remove("%s.txt" % (name))
packageEbook()
print("Done")