#!/usr/bin/python3
"""
Script to download webpages, extract their text and merge all of them
together to create one ebook.
"""

import errno
import os
import re

from bs4 import BeautifulSoup
from ebooklib import epub
from newspaper import Article

# Book name
name = "Name"
# Base url; each chapter url is built as mainUrl + <chapter number>
mainUrl = "http://example.com/chapter-"
# Number of all chapters
numberOfChapters = 1
# Start from 0 or 1?
fromZero = False

# Numerical representation of the first chapter number used by the script
start = 0 if fromZero else 1

# One paragraph: either a line followed by a blank line, or a line up to its
# end (MULTILINE makes `$` match at every line end, `.` never crosses "\n").
re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)

# Rendered chapter texts, appended by main() and consumed by packageEbook()
chapters = []


def download(number):
    """Download one chapter page and return its formatted text.

    The page at ``mainUrl`` followed by ``number`` is fetched and parsed with
    newspaper's ``Article``. The extracted text is split into paragraphs with
    ``re_paragraph``; the navigation line "Previous ChapterNext Chapter" is
    dropped, the first remaining paragraph is formatted as the header and
    every later one as a body paragraph, all appended after a
    "<name> - <number>" title.

    Args:
        number: Chapter number, formatted with ``%d`` onto ``mainUrl``.

    Returns:
        str: The formatted chapter text.
    """
    url = "%s%d" % (mainUrl, number)
    article = Article(url)
    article.download()
    article.parse()

    # NOTE(review): both templates contain only blank lines around the
    # placeholder. If heading/paragraph markup was intended here, it is not
    # visible in this file - confirm against the upstream version.
    header_template = "\n\n%s\n\n"
    paragraph_template = "\n\n%s\n\n\n"

    pieces = ["%s - %d" % (name, number)]
    header = False
    for match in re_paragraph.finditer(article.text):
        paragraph = match.group().strip()
        if paragraph == "Previous ChapterNext Chapter":
            # Navigation links of the source page, not chapter content.
            continue
        if not header:
            pieces.append(header_template % (paragraph,))
            header = True
        else:
            pieces.append(paragraph_template % (paragraph,))
    chapterText = "".join(pieces)

    # NOTE(review): this document is built but never returned; the function
    # returns the plain chapter text below. Confirm whether the caller was
    # meant to receive str(chapterHtml) instead.
    chapterHtml = BeautifulSoup(
        """ ... """,
        'lxml'
    )
    # The lxml builder does not synthesise a <head>/<title> that the template
    # lacks, so only touch these nodes when they actually exist.
    head = chapterHtml.head
    if head is not None and head.title is not None:
        head.title.string = article.title
    if chapterHtml.body is not None:
        chapterHtml.body.append(chapterText)

    return str(chapterText)


def packageEbook():
    """Bundle the collected ``chapters`` into ``<name>.epub``.

    Creates an EPUB with the stylesheet read from ``template/style.css``, an
    introduction page, one XHTML page per entry of ``chapters`` (numbered
    from ``start``), a table of contents and the navigation files. An
    existing output file with the same name is removed before writing.
    """
    ebook = epub.EpubBook()
    ebook.set_identifier("ebook-%s" % name)
    ebook.set_title(name)
    ebook.set_language('en')

    # Read the stylesheet inside a context manager so the handle is closed.
    with open("template/style.css") as css_file:
        css_content = css_file.read()
    doc_style = epub.EpubItem(
        uid="doc_style",
        file_name="style/main.css",
        media_type="text/css",
        content=css_content
    )
    ebook.add_item(doc_style)

    intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
    intro_ch.add_item(doc_style)
    # NOTE(review): as with the chapter templates, no markup is visible in
    # this literal - confirm against the upstream version.
    intro_ch.content = " Introduction\n\n%s\n\n" % (name)
    ebook.add_item(intro_ch)

    ebookChapters = []
    # Chapter numbers follow the same numbering the downloader used.
    for i, chapter_data in enumerate(chapters, start):
        chapter = epub.EpubHtml(
            title="%s - %d" % (name, i),
            file_name='%s-%d.xhtml' % (name, i)
        )
        chapter.add_item(doc_style)
        chapter.content = chapter_data
        ebook.add_item(chapter)
        ebookChapters.append(chapter)

    # Set the TOC
    ebook.toc = (
        epub.Link('intro.xhtml', 'Introduction', 'intro'),
        (epub.Section('Chapters'), ebookChapters)
    )

    # Add navigation files
    ebook.add_item(epub.EpubNcx())
    ebook.add_item(epub.EpubNav())

    # Create spine
    nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
    nav_page.add_item(doc_style)
    ebook.add_item(nav_page)
    ebook.spine = [intro_ch, nav_page] + ebookChapters

    filename = '%s.epub' % (name)
    print("Saving to '%s'" % filename)
    if os.path.exists(filename):
        os.remove(filename)
    epub.write_epub(filename, ebook, {})


def main():
    """Download every chapter, then package them into the ebook."""
    # Download all chapters one by one
    print("Downloading...")
    for i in range(start, numberOfChapters + 1):
        print("Downloading: ", name, i)
        chapters.append(download(i))

    packageEbook()
    print("Done")


# If used standalone, start the script
if __name__ == '__main__':
    main()