Rewrite using Python libraries, use tabs

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
2018-01-03 16:12:09 +01:00 · 2018-01-03 16:12:09 +01:00 · c09655294a
commit c09655294a
parent 0cb72ef82d
2 changed files with 120 additions and 58 deletions
--- a/README.org
+++ b/README.org
@ -1,12 +1,13 @@
 * NovelDownloader
-Script to download each chapter of the webnovel, merge them together and convert to epub
+Script to download each chapter of the webnovel and save all as epub.

 * Dependencies
- ebook-convert (Calibre)
+- python-beautifulsoup4
+- python-ebooklib
 - python-newspaper3k

 * Usage
-Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
+Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script.

 * Author`s comment
-Slowly getting rid of external commands.
+Basic function is done, requires refactoring.
--- a/novelDownloader.py
+++ b/novelDownloader.py
@ -6,18 +6,18 @@ together to create one ebook.

 import errno
 import os
-import shutil
-import subprocess
-import sys
-from newspaper import Article
+import re

+from bs4 import BeautifulSoup
+from ebooklib import epub
+from newspaper import Article

 # Book name
 name = "Name"
 # Base url, this is used as url = mainUrl + <number of chapter>
 mainUrl = "http://example.com/chapter-"
 # Number of all chapter
-chapters = 1
+numberOfChapters = 1
 # Start from 0 or 1 ?
 fromZero = False

@ -25,67 +25,128 @@ fromZero = False
 # numerical representation of start for script
 start = 0 if fromZero else 1

+re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)

-def runInShell(command):
-  """Run giver string in shell"""
-  process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
-  process.wait()
-  process.communicate()
+chapters = []


-def download(mainUrl, name, number):
-  """Download webpage, extract main article, save result"""
-  fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
-  url = "%s%d" % (mainUrl, number)
+def download(number):
+	"""Download webpage, extract main article, save result"""
+	url = "%s%d" % (mainUrl, number)

-  article = Article(url)
-  article.download()
-  article.parse()
+	article = Article(url)
+	article.download()
+	article.parse()

-  # save file
-  try:
-    file = open(fileName, "w")
-    file.write(article.text)
-    file.close()
-  except OSError as err:
-    print("OS error: {0}".format(err))
-  except:
-    print("Unexpected error:", sys.exc_info()[0])
-    raise
+	chapterText = "%s - %d" % (name, number)
+	header = False
+
+	for match in re_paragraph.finditer(article.text):
+		paragraph = match.group()
+		paragraph = paragraph.strip()
+
+		if paragraph != "Previous ChapterNext Chapter":
+			if not header:
+				chapterText += "<h2>%s</h2>" % (paragraph)
+				header = True
+			else:
+				chapterText += "<p>%s</p>\n" % (paragraph)
+
+	chapterHtml = BeautifulSoup(
+		"""<html>
+		<head>
+				<title>...</title>
+				<link rel="stylesheet" type="text/css" href="style/main.css" />
+		</head>
+		<body></body>
+		</html>""",
+		'lxml'
+	)
+	chapterHtml.head.title.string = article.title
+	chapterHtml.body.append(chapterText)
+
+	return str(chapterText)
+
+
+def packageEbook():
+	ebook = epub.EpubBook()
+	ebook.set_identifier("ebook-%s" % name)
+	ebook.set_title(name)
+	ebook.set_language('en')
+	doc_style = epub.EpubItem(
+		uid="doc_style",
+		file_name="style/main.css",
+		media_type="text/css",
+		content=open("template/style.css").read()
+	)
+	ebook.add_item(doc_style)
+
+	intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
+	intro_ch.add_item(doc_style)
+	intro_ch.content = """
+	<html>
+	<head>
+			<title>Introduction</title>
+			<link rel="stylesheet" href="style/main.css" type="text/css" />
+	</head>
+	<body>
+			<h1>%s</h1>
+	</body>
+	</html>
+	""" % (name)
+	ebook.add_item(intro_ch)
+
+	ebookChapters = []
+
+	i = start
+	for chapter_data in chapters:
+		chapter = epub.EpubHtml(
+			title="%s - %d" % (name, i),
+			file_name='%s-%d.xhtml' % (name, i)
+		)
+		chapter.add_item(doc_style)
+		chapter.content = chapter_data
+		ebook.add_item(chapter)
+		ebookChapters.append(chapter)
+
+		i += 1
+
+	# Set the TOC
+	ebook.toc = (
+		epub.Link('intro.xhtml', 'Introduction', 'intro'),
+		(epub.Section('Chapters'), ebookChapters)
+	)
+	# add navigation files
+	ebook.add_item(epub.EpubNcx())
+	ebook.add_item(epub.EpubNav())
+
+	# Create spine
+	nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
+	nav_page.add_item(doc_style)
+	ebook.add_item(nav_page)
+	ebook.spine = [intro_ch, nav_page] + ebookChapters
+
+	filename = '%s.epub' % (name)
+	print("Saving to '%s'" % filename)
+	if os.path.exists(filename):
+			os.remove(filename)
+	epub.write_epub(filename, ebook, {})


 def main():
-  """Start main downloader and converter"""
-  try:
-    os.makedirs("chapters")
-  except OSError as e:
-    if e.errno != errno.EEXIST:
-      raise
+	"""Start main downloader and converter"""

-  # Download all chapters one by one
-  print("Downloading...")
-  for i in range(start, chapters + 1, 1):
-    print("Downloading: ", name, i)
-    download(mainUrl, name, i)
+	# Download all chapters one by one
+	print("Downloading...")
+	for i in range(start, numberOfChapters + 1, 1):
+		print("Downloading: ", name, i)
+		chapters.append(download(i))

-  # merge all chapter to one file
-  print("Merging...")
-  command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name)
-  runInShell(command)
+	packageEbook()

-  # convert to epub
-  print("Converting...")
-  command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
-  runInShell(command)
-
-  # remove download directory
-  print("Removing temporary data...")
-  shutil.rmtree('chapters')
-  os.remove("%s.txt" % (name))
-
-  print("Done")
+	print("Done")


 # if yused standalone start the script
 if __name__ == '__main__':
-  main()
+	main()