Rewrtie using python libraries, use tabs
Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
parent
0cb72ef82d
commit
c09655294a
@ -1,12 +1,13 @@
|
||||
* NovelDownloader
|
||||
Script to download each chapter of the webnovel, merge them together and convert to epub
|
||||
Script to download each chapter of the webnovel and save all as epub.
|
||||
|
||||
* Dependencies
|
||||
- ebook-convert (Calibre)
|
||||
- python-beautifulsoup4
|
||||
- python-ebooklib
|
||||
- python-newspaper3k
|
||||
|
||||
* Usage
|
||||
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
|
||||
Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script.
|
||||
|
||||
* Author`s comment
|
||||
Slowly getting rid of external commands.
|
||||
Basic function is done, requires refactoring.
|
||||
|
@ -6,18 +6,18 @@ together to create one ebook.
|
||||
|
||||
import errno
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from newspaper import Article
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
from newspaper import Article
|
||||
|
||||
# Book name
|
||||
name = "Name"
|
||||
# Base url, this is used as url = mainUrl + <number of chapter>
|
||||
mainUrl = "http://example.com/chapter-"
|
||||
# Number of all chapter
|
||||
chapters = 1
|
||||
numberOfChapters = 1
|
||||
# Start from 0 or 1 ?
|
||||
fromZero = False
|
||||
|
||||
@ -25,67 +25,128 @@ fromZero = False
|
||||
# numerical representation of start for script
|
||||
start = 0 if fromZero else 1
|
||||
|
||||
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
|
||||
|
||||
def runInShell(command):
|
||||
"""Run giver string in shell"""
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
|
||||
process.wait()
|
||||
process.communicate()
|
||||
chapters = []
|
||||
|
||||
|
||||
def download(mainUrl, name, number):
|
||||
"""Download webpage, extract main article, save result"""
|
||||
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
|
||||
url = "%s%d" % (mainUrl, number)
|
||||
def download(number):
|
||||
"""Download webpage, extract main article, save result"""
|
||||
url = "%s%d" % (mainUrl, number)
|
||||
|
||||
article = Article(url)
|
||||
article.download()
|
||||
article.parse()
|
||||
article = Article(url)
|
||||
article.download()
|
||||
article.parse()
|
||||
|
||||
# save file
|
||||
try:
|
||||
file = open(fileName, "w")
|
||||
file.write(article.text)
|
||||
file.close()
|
||||
except OSError as err:
|
||||
print("OS error: {0}".format(err))
|
||||
except:
|
||||
print("Unexpected error:", sys.exc_info()[0])
|
||||
raise
|
||||
chapterText = "%s - %d" % (name, number)
|
||||
header = False
|
||||
|
||||
for match in re_paragraph.finditer(article.text):
|
||||
paragraph = match.group()
|
||||
paragraph = paragraph.strip()
|
||||
|
||||
if paragraph != "Previous ChapterNext Chapter":
|
||||
if not header:
|
||||
chapterText += "<h2>%s</h2>" % (paragraph)
|
||||
header = True
|
||||
else:
|
||||
chapterText += "<p>%s</p>\n" % (paragraph)
|
||||
|
||||
chapterHtml = BeautifulSoup(
|
||||
"""<html>
|
||||
<head>
|
||||
<title>...</title>
|
||||
<link rel="stylesheet" type="text/css" href="style/main.css" />
|
||||
</head>
|
||||
<body></body>
|
||||
</html>""",
|
||||
'lxml'
|
||||
)
|
||||
chapterHtml.head.title.string = article.title
|
||||
chapterHtml.body.append(chapterText)
|
||||
|
||||
return str(chapterText)
|
||||
|
||||
|
||||
def packageEbook():
|
||||
ebook = epub.EpubBook()
|
||||
ebook.set_identifier("ebook-%s" % name)
|
||||
ebook.set_title(name)
|
||||
ebook.set_language('en')
|
||||
doc_style = epub.EpubItem(
|
||||
uid="doc_style",
|
||||
file_name="style/main.css",
|
||||
media_type="text/css",
|
||||
content=open("template/style.css").read()
|
||||
)
|
||||
ebook.add_item(doc_style)
|
||||
|
||||
intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
|
||||
intro_ch.add_item(doc_style)
|
||||
intro_ch.content = """
|
||||
<html>
|
||||
<head>
|
||||
<title>Introduction</title>
|
||||
<link rel="stylesheet" href="style/main.css" type="text/css" />
|
||||
</head>
|
||||
<body>
|
||||
<h1>%s</h1>
|
||||
</body>
|
||||
</html>
|
||||
""" % (name)
|
||||
ebook.add_item(intro_ch)
|
||||
|
||||
ebookChapters = []
|
||||
|
||||
i = start
|
||||
for chapter_data in chapters:
|
||||
chapter = epub.EpubHtml(
|
||||
title="%s - %d" % (name, i),
|
||||
file_name='%s-%d.xhtml' % (name, i)
|
||||
)
|
||||
chapter.add_item(doc_style)
|
||||
chapter.content = chapter_data
|
||||
ebook.add_item(chapter)
|
||||
ebookChapters.append(chapter)
|
||||
|
||||
i += 1
|
||||
|
||||
# Set the TOC
|
||||
ebook.toc = (
|
||||
epub.Link('intro.xhtml', 'Introduction', 'intro'),
|
||||
(epub.Section('Chapters'), ebookChapters)
|
||||
)
|
||||
# add navigation files
|
||||
ebook.add_item(epub.EpubNcx())
|
||||
ebook.add_item(epub.EpubNav())
|
||||
|
||||
# Create spine
|
||||
nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
|
||||
nav_page.add_item(doc_style)
|
||||
ebook.add_item(nav_page)
|
||||
ebook.spine = [intro_ch, nav_page] + ebookChapters
|
||||
|
||||
filename = '%s.epub' % (name)
|
||||
print("Saving to '%s'" % filename)
|
||||
if os.path.exists(filename):
|
||||
os.remove(filename)
|
||||
epub.write_epub(filename, ebook, {})
|
||||
|
||||
|
||||
def main():
|
||||
"""Start main downloader and converter"""
|
||||
try:
|
||||
os.makedirs("chapters")
|
||||
except OSError as e:
|
||||
if e.errno != errno.EEXIST:
|
||||
raise
|
||||
"""Start main downloader and converter"""
|
||||
|
||||
# Download all chapters one by one
|
||||
print("Downloading...")
|
||||
for i in range(start, chapters + 1, 1):
|
||||
print("Downloading: ", name, i)
|
||||
download(mainUrl, name, i)
|
||||
# Download all chapters one by one
|
||||
print("Downloading...")
|
||||
for i in range(start, numberOfChapters + 1, 1):
|
||||
print("Downloading: ", name, i)
|
||||
chapters.append(download(i))
|
||||
|
||||
# merge all chapter to one file
|
||||
print("Merging...")
|
||||
command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name)
|
||||
runInShell(command)
|
||||
packageEbook()
|
||||
|
||||
# convert to epub
|
||||
print("Converting...")
|
||||
command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
|
||||
runInShell(command)
|
||||
|
||||
# remove download directory
|
||||
print("Removing temporary data...")
|
||||
shutil.rmtree('chapters')
|
||||
os.remove("%s.txt" % (name))
|
||||
|
||||
print("Done")
|
||||
print("Done")
|
||||
|
||||
|
||||
# if yused standalone start the script
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
main()
|
||||
|
Loading…
Reference in New Issue
Block a user