Use python library "newspaper" to extract chapter

- Update README to reflect this change.
- Change file format of README to org
- close #1

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
Juraj Oravec 2017-11-09 22:59:48 +01:00
parent 741c8f3c75
commit b963f275c9
No known key found for this signature in database
GPG Key ID: 63ACB65056BC8D07
2 changed files with 23 additions and 25 deletions

View File

@ -1,14 +1,12 @@
# NovelDownloader
* NovelDownloader
Script to download each chapter of the webnovel, merge them together and convert to epub
# Dependencies
- jq
- nodejs-unfluff
* Dependencies
- ebook-convert (Calibre)
- wget
- python-newspaper3k
# Usage
* Usage
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
# Author's comment
Bash would be a better choice for this script, but Python is cuter.
* Author's comment
Slowly getting rid of external commands.

View File

@ -8,6 +8,8 @@ import errno
import os
import shutil
import subprocess
import sys
from newspaper import Article
# Book name
@ -32,26 +34,24 @@ def runInShell(command):
def download(mainUrl, name, number):
"""Download webpage, extract test, add some empty lines"""
"""Download webpage, extract main article, save result"""
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
url = "%s%d" % (mainUrl, number)
# download webpage
command = 'wget -q -O- "%s%d" | unfluff | jq -r ".title, .text" > "%s"' %\
(mainUrl, number, fileName)
runInShell(command)
article = Article(url)
article.download()
article.parse()
# New line after title
command = "sed -i '1 a\\\\' '%s'" % (fileName)
runInShell(command)
# New line at the end of the file
command = "sed -i -e '$a\\' '%s'" % (fileName)
runInShell(command)
if number != start:
# New line at the beginning of the file
command = "sed -i '1i\\\\' '%s'" % (fileName)
runInShell(command)
# save file
try:
file = open(fileName, "w")
file.write(article.text)
file.close()
except OSError as err:
print("OS error: {0}".format(err))
except:
print("Unexpected error:", sys.exc_info()[0])
raise
def main():