Rewrite using Python libraries, use tabs

Signed-off-by: Juraj Oravec <sgd.orava@gmail.com>
2018-01-03 16:12:09 +01:00 · 2018-01-03 16:12:09 +01:00 · c09655294a
commit c09655294a
parent 0cb72ef82d
2 changed files with 120 additions and 58 deletions
--- a/README.org
+++ b/README.org
@ -1,12 +1,13 @@
 * NovelDownloader
-Script to download each chapter of the webnovel, merge them together and convert to epub
+Script to download each chapter of the webnovel and save all as epub.
 * Dependencies
- ebook-convert (Calibre)
+- python-beautifulsoup4
 - python-ebooklib
 - python-newspaper3k
 * Usage
-Set variables name, mainUrl, chapters and fromZero at the beginning of the python file and run the script
+Set variables name, mainUrl, numberOfChapters and fromZero at the beginning of the python file and run the script.
 * Author's comment
-Slowly getting rid of external commands.
+Basic function is done, requires refactoring.
--- a/novelDownloader.py
+++ b/novelDownloader.py
@ -6,18 +6,18 @@ together to create one ebook.
 import errno
 import os
-import shutil
+import re
 import subprocess
 import sys
 from newspaper import Article
 from bs4 import BeautifulSoup
 from ebooklib import epub
 from newspaper import Article
 # Book name
 name = "Name"
 # Base url, this is used as url = mainUrl + <number of chapter>
 mainUrl = "http://example.com/chapter-"
 # Number of all chapters
-chapters = 1
+numberOfChapters = 1
 # Start from 0 or 1 ?
 fromZero = False
@ -25,67 +25,128 @@ fromZero = False
 # numerical representation of start for script
 start = 0 if fromZero else 1
 re_paragraph = re.compile(r"(.+?\n\n|.+?$)", re.MULTILINE)
-def runInShell(command):
+chapters = []
  """Run giver string in shell"""
  process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
  process.wait()
  process.communicate()
-def download(mainUrl, name, number):
+def download(number):
-  """Download webpage, extract main article, save result"""
+	"""Download webpage, extract main article, save result"""
-  fileName = os.path.join('chapters', '%s-%d.txt' % (name, number))
+	url = "%s%d" % (mainUrl, number)
  url = "%s%d" % (mainUrl, number)
-  article = Article(url)
+	article = Article(url)
-  article.download()
+	article.download()
-  article.parse()
+	article.parse()
-  # save file
+	chapterText = "%s - %d" % (name, number)
-  try:
+	header = False
-    file = open(fileName, "w")
+
-    file.write(article.text)
+	for match in re_paragraph.finditer(article.text):
-    file.close()
+		paragraph = match.group()
-  except OSError as err:
+		paragraph = paragraph.strip()
-    print("OS error: {0}".format(err))
+
-  except:
+		if paragraph != "Previous ChapterNext Chapter":
-    print("Unexpected error:", sys.exc_info()[0])
+			if not header:
-    raise
+				chapterText += "<h2>%s</h2>" % (paragraph)
 				header = True
 			else:
 				chapterText += "<p>%s</p>\n" % (paragraph)
 	chapterHtml = BeautifulSoup(
 		"""<html>
 		<head>
 				<title>...</title>
 				<link rel="stylesheet" type="text/css" href="style/main.css" />
 		</head>
 		<body></body>
 		</html>""",
 		'lxml'
 	)
 	chapterHtml.head.title.string = article.title
 	chapterHtml.body.append(chapterText)
 	return str(chapterText)
 def packageEbook():
 	"""Package the module-level ``chapters`` list into '<name>.epub'.
 	Reads the globals ``name``, ``start`` and ``chapters``. Each entry of
 	``chapters`` becomes the content of one chapter, numbered upwards from
 	``start``. The stylesheet is read from 'template/style.css' and the book
 	is written to the current directory, after removing any existing file
 	with the same name.
 	"""
 	ebook = epub.EpubBook()
 	ebook.set_identifier("ebook-%s" % name)
 	ebook.set_title(name)
 	ebook.set_language('en')
 	# Read the stylesheet inside a context manager so the file handle is
 	# closed right away instead of being left to the garbage collector.
 	with open("template/style.css") as style_file:
 		style_content = style_file.read()
 	doc_style = epub.EpubItem(
 		uid="doc_style",
 		file_name="style/main.css",
 		media_type="text/css",
 		content=style_content
 	)
 	ebook.add_item(doc_style)
 	# Introduction page showing only the book name as a heading
 	intro_ch = epub.EpubHtml(title="Introduction", file_name='intro.xhtml')
 	intro_ch.add_item(doc_style)
 	intro_ch.content = """
 	<html>
 	<head>
 			<title>Introduction</title>
 			<link rel="stylesheet" href="style/main.css" type="text/css" />
 	</head>
 	<body>
 			<h1>%s</h1>
 	</body>
 	</html>
 	""" % (name)
 	ebook.add_item(intro_ch)
 	# One EpubHtml item per downloaded chapter; i is the chapter number
 	# used in both the title and the file name.
 	ebookChapters = []
 	i = start
 	for chapter_data in chapters:
 		chapter = epub.EpubHtml(
 			title="%s - %d" % (name, i),
 			file_name='%s-%d.xhtml' % (name, i)
 		)
 		chapter.add_item(doc_style)
 		chapter.content = chapter_data
 		ebook.add_item(chapter)
 		ebookChapters.append(chapter)
 		i += 1
 	# Set the TOC
 	ebook.toc = (
 		epub.Link('intro.xhtml', 'Introduction', 'intro'),
 		(epub.Section('Chapters'), ebookChapters)
 	)
 	# add navigation files
 	ebook.add_item(epub.EpubNcx())
 	ebook.add_item(epub.EpubNav())
 	# Create spine
 	nav_page = epub.EpubNav(uid='book_toc', file_name='toc.xhtml')
 	nav_page.add_item(doc_style)
 	ebook.add_item(nav_page)
 	ebook.spine = [intro_ch, nav_page] + ebookChapters
 	filename = '%s.epub' % (name)
 	print("Saving to '%s'" % filename)
 	# Remove a previous output file before writing the new one
 	if os.path.exists(filename):
 		os.remove(filename)
 	epub.write_epub(filename, ebook, {})
 def main():
-  """Start main downloader and converter"""
+	"""Start main downloader and converter"""
  try:
    os.makedirs("chapters")
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise
-  # Download all chapters one by one
+	# Download all chapters one by one
-  print("Downloading...")
+	print("Downloading...")
-  for i in range(start, chapters + 1, 1):
+	for i in range(start, numberOfChapters + 1, 1):
-    print("Downloading: ", name, i)
+		print("Downloading: ", name, i)
-    download(mainUrl, name, i)
+		chapters.append(download(i))
-  # merge all chapter to one file
+	packageEbook()
  print("Merging...")
  command = 'cat "chapters/%s-"{%d..%d}.txt > "%s.txt"' % (name, start, chapters, name)
  runInShell(command)
-  # convert to epub
+	print("Done")
  print("Converting...")
  command = 'ebook-convert "%s.txt" "%s.epub"' % (name, name)
  runInShell(command)
  # remove download directory
  print("Removing temporary data...")
  shutil.rmtree('chapters')
  os.remove("%s.txt" % (name))
  print("Done")
 # if used standalone, start the script
 if __name__ == '__main__':
-  main()
+	main()