Use python library "newspaper" to extract chapter
- Update README to reflect this change. - Change file format of README to org - close #1 Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
This commit is contained in:
parent
741c8f3c75
commit
b963f275c9
@ -1,14 +1,12 @@
|
|||||||
# NovelDownloader
|
* NovelDownloader
|
||||||
Script to download each chapter of the webnovel, merge them together and convert to epub
|
Script to download each chapter of the webnovel, merge them together and convert to epub
|
||||||
|
|
||||||
# Dependencies
|
* Dependencies
|
||||||
- jq
|
|
||||||
- nodejs-unfluff
|
|
||||||
- ebook-convert (Calibre)
|
- ebook-convert (Calibre)
|
||||||
- wget
|
- python-newspaper3k
|
||||||
|
|
||||||
# Usage
|
* Usage
|
||||||
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
|
Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
|
||||||
|
|
||||||
# Author's comment
|
* Author's comment
|
||||||
Bash would be a better choice for this script, but Python is cuter.
|
Slowly getting rid of external commands.
|
@ -8,6 +8,8 @@ import errno
|
|||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from newspaper import Article
|
||||||
|
|
||||||
|
|
||||||
# Book name
|
# Book name
|
||||||
@ -32,26 +34,24 @@ def runInShell(command):
|
|||||||
|
|
||||||
|
|
||||||
def download(mainUrl, name, number):
|
def download(mainUrl, name, number):
|
||||||
"""Download webpage, extract test, add some empty lines"""
|
"""Download webpage, extract main article, save result"""
|
||||||
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
|
fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
|
||||||
|
url = "%s%d" % (mainUrl, number)
|
||||||
|
|
||||||
# download webpage
|
article = Article(url)
|
||||||
command = 'wget -q -O- "%s%d" | unfluff | jq -r ".title, .text" > "%s"' %\
|
article.download()
|
||||||
(mainUrl, number, fileName)
|
article.parse()
|
||||||
runInShell(command)
|
|
||||||
|
|
||||||
# New line after title
|
# save file
|
||||||
command = "sed -i '1 a\\\\' '%s'" % (fileName)
|
try:
|
||||||
runInShell(command)
|
file = open(fileName, "w")
|
||||||
|
file.write(article.text)
|
||||||
# New line at the end of the file
|
file.close()
|
||||||
command = "sed -i -e '$a\\' '%s'" % (fileName)
|
except OSError as err:
|
||||||
runInShell(command)
|
print("OS error: {0}".format(err))
|
||||||
|
except:
|
||||||
if number != start:
|
print("Unexpected error:", sys.exc_info()[0])
|
||||||
# New lne at beginning of the file
|
raise
|
||||||
command = "sed -i '1i\\\\' '%s'" % (fileName)
|
|
||||||
runInShell(command)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
Loading…
Reference in New Issue
Block a user