Use python library "newspaper" to extract chapter

- Update README to reflect this change.
- Change file format of README to org
- close #1

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
Juraj Oravec 2017-11-09 22:59:48 +01:00
parent 741c8f3c75
commit b963f275c9
No known key found for this signature in database
GPG Key ID: 63ACB65056BC8D07
2 changed files with 23 additions and 25 deletions

View File

@ -1,14 +1,12 @@
# NovelDownloader * NovelDownloader
Script to download each chapter of the webnovel, merge them together and convert to epub Script to download each chapter of the webnovel, merge them together and convert to epub
# Dependencies * Dependencies
- jq
- nodejs-unfluff
- ebook-convert (Calibre) - ebook-convert (Calibre)
- wget - python-newspaper3k
# Usage * Usage
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
# Author's comment * Author's comment
Bash would be a better choice for this script but Python is cuter. Slowly getting rid of external commands.

View File

@ -8,6 +8,8 @@ import errno
import os import os
import shutil import shutil
import subprocess import subprocess
import sys
from newspaper import Article
# Book name # Book name
@ -32,26 +34,24 @@ def runInShell(command):
def download(mainUrl, name, number): def download(mainUrl, name, number):
"""Download webpage, extract test, add some empty lines""" """Download webpage, extract main article, save result"""
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number)) fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
url = "%s%d" % (mainUrl, number)
# download webpage article = Article(url)
command = 'wget -q -O- "%s%d" | unfluff | jq -r ".title, .text" > "%s"' %\ article.download()
(mainUrl, number, fileName) article.parse()
runInShell(command)
# New line after title # save file
command = "sed -i '1 a\\\\' '%s'" % (fileName) try:
runInShell(command) file = open(fileName, "w")
file.write(article.text)
# New line at the end of the file file.close()
command = "sed -i -e '$a\\' '%s'" % (fileName) except OSError as err:
runInShell(command) print("OS error: {0}".format(err))
except:
if number != start: print("Unexpected error:", sys.exc_info()[0])
# New lne at beginning of the file raise
command = "sed -i '1i\\\\' '%s'" % (fileName)
runInShell(command)
def main(): def main():