diff --git a/README.org b/README.org index e4c867e..f64e01e 100644 --- a/README.org +++ b/README.org @@ -1,12 +1,13 @@ * NovelDownloader -Script to download each chapter of the webnovel, merge them together and convert to epub +Script to download each chapter of the webnovel and save all as epub. * Dependencies -- ebook-convert (Calibre) +- python-beautifulsoup4 +- python-ebooklib - python-newspaper3k * Usage -Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script +Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script. * Author`s comment -Slowly getting rid of external commands. +Basic function is done, requires refactoring. diff --git a/novelDownloader.py b/novelDownloader.py index 580b697..3b37780 100755 --- a/novelDownloader.py +++ b/novelDownloader.py @@ -6,18 +6,18 @@ together to create one ebook. import errno import os -import shutil -import subprocess -import sys -from newspaper import Article +import re +from bs4 import BeautifulSoup +from ebooklib import epub +from newspaper import Article # Book name name = "Name" # Base url, this is used as url = mainUrl + mainUrl = "http://example.com/chapter-" # Number of all chapter -chapters = 1 +numberOfChapters = 1 # Start from 0 or 1 ? fromZero = False @@ -25,67 +25,128 @@ fromZero = False # numerical representation of start for script start = 0 if fromZero else 1 +re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE) -def runInShell(command): - """Run giver string in shell""" - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) - process.wait() - process.communicate() +chapters = [] -def download(mainUrl, name, number): - """Download webpage, extract main article, save result""" - fileName = os.path.join('chapters', '%s-%d.txt' % (name, number)) - url = "%s%d" % (mainUrl, number) +def download(number): + """Download webpage, extract main article, save result""" + url = "%s%d" % (mainUrl, number) - article = Article(url) - article.download() - article.parse() + article = Article(url) + article.download() + article.parse() - # save file - try: - file = open(fileName, "w") - file.write(article.text) - file.close() - except OSError as err: - print("OS error: {0}".format(err)) - except: - print("Unexpected error:", sys.exc_info()[0]) - raise + chapterText = "%s - %d" % (name, number) + header = False + + for match in re_paragraph.finditer(article.text): + paragraph = match.group() + paragraph = paragraph.strip() + + if paragraph != "Previous ChapterNext Chapter": + if not header: + chapterText += "

%s

" % (paragraph) + header = True + else: + chapterText += "

%s

\n" % (paragraph) + + chapterHtml = BeautifulSoup( + """ + + ... + + + + """, + 'lxml' + ) + chapterHtml.head.title.string = article.title + chapterHtml.body.append(chapterText) + + return str(chapterText) + + +def packageEbook(): + ebook = epub.EpubBook() + ebook.set_identifier("ebook-%s" % name) + ebook.set_title(name) + ebook.set_language('en') + doc_style = epub.EpubItem( + uid="doc_style", + file_name="style/main.css", + media_type="text/css", + content=open("template/style.css").read() + ) + ebook.add_item(doc_style) + + intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml') + intro_ch.add_item(doc_style) + intro_ch.content = """ + + + Introduction + + + +

%s

+ + + """ % (name) + ebook.add_item(intro_ch) + + ebookChapters = [] + + i = start + for chapter_data in chapters: + chapter = epub.EpubHtml( + title="%s - %d" % (name, i), + file_name='%s-%d.xhtml' % (name, i) + ) + chapter.add_item(doc_style) + chapter.content = chapter_data + ebook.add_item(chapter) + ebookChapters.append(chapter) + + i += 1 + + # Set the TOC + ebook.toc = ( + epub.Link('intro.xhtml', 'Introduction', 'intro'), + (epub.Section('Chapters'), ebookChapters) + ) + # add navigation files + ebook.add_item(epub.EpubNcx()) + ebook.add_item(epub.EpubNav()) + + # Create spine + nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml') + nav_page.add_item(doc_style) + ebook.add_item(nav_page) + ebook.spine = [intro_ch, nav_page] + ebookChapters + + filename = '%s.epub' % (name) + print("Saving to '%s'" % filename) + if os.path.exists(filename): + os.remove(filename) + epub.write_epub(filename, ebook, {}) def main(): - """Start main downloader and converter""" - try: - os.makedirs("chapters") - except OSError as e: - if e.errno != errno.EEXIST: - raise + """Start main downloader and converter""" - # Download all chapters one by one - print("Downloading...") - for i in range(start, chapters + 1, 1): - print("Downloading: ", name, i) - download(mainUrl, name, i) + # Download all chapters one by one + print("Downloading...") + for i in range(start, numberOfChapters + 1, 1): + print("Downloading: ", name, i) + chapters.append(download(i)) - # merge all chapter to one file - print("Merging...") - command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name) - runInShell(command) + packageEbook() - # convert to epub - print("Converting...") - command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name) - runInShell(command) - - # remove download directory - print("Removing temporary data...") - shutil.rmtree('chapters') - os.remove("%s.txt" % (name)) - - print("Done") + print("Done") # if yused standalone start the script if __name__ == '__main__': - main() + main()