From b963f275c9e98c3a627bd7af01fb222ad4393d67 Mon Sep 17 00:00:00 2001
From: Juraj Oravec
Date: Thu, 9 Nov 2017 22:59:48 +0100
Subject: [PATCH] Use python library "newspaper" to extract chapter

- Update README to reflect this change.
- Change file format of README to org
- close #1

Signed-off-by: Juraj Oravec
---
 README.md => README.org | 14 ++++++--------
 novelDownloader.py      | 34 +++++++++++++++++-----------------
 2 files changed, 23 insertions(+), 25 deletions(-)
 rename README.md => README.org (59%)

diff --git a/README.md b/README.org
similarity index 59%
rename from README.md
rename to README.org
index 33c5e67..e4c867e 100644
--- a/README.md
+++ b/README.org
@@ -1,14 +1,12 @@
-# NovelDownloader
+* NovelDownloader
 Script to download each chapter of the webnovel, merge them together and convert to epub
 
-# Dependencies
-- jq
-- nodejs-unfluff
+* Dependencies
 - ebook-convert (Calibre)
-- wget
+- python-newspaper3k
 
-# Usage
+* Usage
 Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
 
-# Author`s comment
-Bash would be better choice for this script but Python is cuter.
+* Author`s comment
+Slowly getting rid of external commands.
diff --git a/novelDownloader.py b/novelDownloader.py
index e655247..580b697 100755
--- a/novelDownloader.py
+++ b/novelDownloader.py
@@ -8,6 +8,8 @@ import errno
 import os
 import shutil
 import subprocess
+import sys
+from newspaper import Article
 
 
 # Book name
@@ -32,26 +34,30 @@ def runInShell(command):
 
 
 def download(mainUrl, name, number):
-    """Download webpage, extract test, add some empty lines"""
+    """Download one chapter and save its title and text to a file.
+
+    The page at mainUrl + number is fetched and parsed with newspaper's
+    Article, and the result is written to chapters/<name>-<number>.txt.
+    An OSError while writing is reported on stderr and not re-raised.
+    """
     fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
+    url = "%s%d" % (mainUrl, number)
 
-    # download webpage
-    command = 'wget -q -O- "%s%d" | unfluff | jq -r ".title, .text" > "%s"' %\
-        (mainUrl, number, fileName)
-    runInShell(command)
+    article = Article(url)
+    article.download()
+    article.parse()
 
-    # New line after title
-    command = "sed -i '1 a\\\\' '%s'" % (fileName)
-    runInShell(command)
-
-    # New line at the end of the file
-    command = "sed -i -e '$a\\' '%s'" % (fileName)
-    runInShell(command)
-
-    if number != start:
-        # New lne at beginning of the file
-        command = "sed -i '1i\\\\' '%s'" % (fileName)
-        runInShell(command)
+    # Title first, then a blank line, then the text; the trailing blank
+    # line keeps chapters apart once the files are merged.
+    content = "%s\n\n%s\n\n" % (article.title, article.text)
+
+    try:
+        # "with" closes the file even when the write fails; explicit UTF-8
+        # keeps non-ASCII chapter text independent of the locale.
+        with open(fileName, "w", encoding="utf-8") as chapterFile:
+            chapterFile.write(content)
+    except OSError as err:
+        print("OS error: {0}".format(err), file=sys.stderr)
 
 
 def main():