2017-11-07 19:40:55 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
"""
|
|
|
|
Script to download webpages, extract text and merge all of them
|
|
|
|
together to create one ebook.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import errno
|
|
|
|
import os
|
|
|
|
import shutil
|
|
|
|
import subprocess
|
2017-11-09 22:59:48 +01:00
|
|
|
import sys
|
|
|
|
from newspaper import Article
|
2017-11-07 19:40:55 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Title of the book; it is also the prefix of every generated file name.
name = "Name"
# Common URL prefix: a chapter's address is mainUrl + <chapter number>.
mainUrl = "http://example.com/chapter-"
# Number of the last chapter to fetch.
chapters = 1
# True when chapter numbering begins at 0, False when it begins at 1.
fromZero = False

# Numeric index of the first chapter the script processes.
if fromZero:
    start = 0
else:
    start = 1
|
|
|
|
|
|
|
|
|
|
|
|
def runInShell(command):
    """Run the given command string in a shell and wait for it to finish.

    The command's standard output is captured through a pipe and discarded,
    so nothing the command writes to stdout reaches the terminal.

    Args:
        command: Command line handed to the shell as a single string.

    Returns:
        The exit status of the command (0 conventionally means success).
    """
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    # communicate() drains the pipe while waiting for the child. Calling
    # wait() first can deadlock: a child that fills the OS pipe buffer blocks
    # on write, and nothing reads stdout until the child has exited.
    process.communicate()
    return process.returncode
|
|
|
|
|
|
|
|
|
|
|
|
def download(mainUrl, name, number):
    """Download one chapter page, extract the main article and save it.

    The page address is ``mainUrl`` followed by ``number``; the extracted
    text is written to ``chapters/<name>-<number>.txt``.

    Args:
        mainUrl: Base URL to which the chapter number is appended.
        name: Book name, used as the prefix of the chapter file name.
        number: Integer chapter number.

    Raises:
        Exception: Any failure other than OSError while saving the file is
            reported and re-raised. An OSError is only printed.
    """
    fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
    url = "%s%d" % (mainUrl, number)

    article = Article(url)
    article.download()
    article.parse()

    # save file
    try:
        # The context manager closes the file even when write() fails, and
        # the explicit encoding keeps non-ASCII text from raising under a
        # non-UTF-8 locale.
        with open(fileName, "w", encoding="utf-8") as chapterFile:
            chapterFile.write(article.text)
    except OSError as err:
        print("OS error: {0}".format(err))
    except Exception as err:
        print("Unexpected error:", type(err))
        raise
|
2017-11-07 19:40:55 +01:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Download every chapter, merge them and convert the result to EPUB.

    Steps, in order:
      1. Create the ``chapters`` working directory if it does not exist.
      2. Download chapters ``start`` through ``chapters`` (inclusive).
      3. Concatenate the chapter files into ``<name>.txt``.
      4. Run ``ebook-convert`` to produce ``<name>.epub``.
      5. Delete the working directory and the merged text file.
    """
    os.makedirs("chapters", exist_ok=True)

    # Download all chapters one by one
    print("Downloading...")
    for i in range(start, chapters + 1):
        print("Downloading: ", name, i)
        download(mainUrl, name, i)

    # Merge all chapters into one file. This is done in Python rather than
    # with `cat chapters/name-{a..b}.txt`: brace expansion is a bash feature,
    # and the shell used by subprocess (/bin/sh) is not necessarily bash.
    print("Merging...")
    mergedName = "%s.txt" % name
    with open(mergedName, "wb") as merged:
        for i in range(start, chapters + 1):
            chapterPath = os.path.join("chapters", "%s-%d.txt" % (name, i))
            try:
                # Binary mode gives a byte-for-byte concatenation.
                with open(chapterPath, "rb") as chapter:
                    shutil.copyfileobj(chapter, merged)
            except FileNotFoundError:
                # Like cat, report a missing chapter and keep merging the rest.
                print("Missing chapter file: %s" % chapterPath)

    # convert to epub
    print("Converting...")
    command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
    runInShell(command)

    # remove download directory
    print("Removing temporary data...")
    shutil.rmtree('chapters')
    os.remove(mergedName)

    print("Done")
|
|
|
|
|
|
|
|
|
|
|
|
# Start the downloader only when this file is executed as a script.
if __name__ == '__main__':
    main()
|