Juraj Oravec
d8a8122bdc
- Only one module is available for now: boxnovel.com - There are still some problems, but it is getting better Signed-off-by: Juraj Oravec <jurajoravec@mailo.com>
86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
import re
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from itertools import cycle
|
|
from typing import List
|
|
|
|
|
|
class Parser:
    """Download a novel's index page and hand out its chapters one by one.

    Usage as far as this class shows: construct with the URL of the index
    page, call :meth:`prepare` once, then call :meth:`nextChapter`
    repeatedly.  Chapters are served from an ``itertools.cycle``, so after
    the last chapter the iteration silently starts over with the first one;
    use ``len(getUrls())`` to know how many chapters there are.
    """

    def __init__(self, url: str):
        """Remember the index URL; nothing is downloaded yet.

        Args:
            url: Address of the novel's index (chapter list) page.
        """
        self.indexUrl: str = url

        # Chapter URLs in the order produced by parseIndex().
        self.chapterUrls: List[str] = []
        # Parsed index page; None until downloadIndex() has run.
        self.indexPage: BeautifulSoup = None
        # Endless iterator over chapterUrls; None until parseIndex() has run.
        self.chapterCycle: cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URLs from it."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter of the cycle.

        Returns:
            A ``(number, title, body)`` tuple.  ``number`` is the chapter
            number as a string of digits, or the int ``-1`` when the
            breadcrumb text does not start with ``Chapter <digits>``.
            ``title`` is the stripped remainder of the breadcrumb text (the
            whole text in the ``-1`` case).  ``body`` is the child node at
            index 1 of the ``reading-content`` div after scripts and ad
            containers have been removed.

        Raises:
            RuntimeError: If :meth:`prepare` has not been called, or the
                content div or the breadcrumb title cannot be found.
            FileNotFoundError: If the chapter page does not answer with
                HTTP 200.
        """
        if self.chapterCycle is None:
            raise RuntimeError("prepare() must be called before nextChapter()")

        url = next(self.chapterCycle)
        page = self._fetch(url)

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title
        breadcrumb = page.find("ol", {"class": "breadcrumb"})
        titleTag = None
        if breadcrumb is not None:
            titleTag = breadcrumb.find("li", {"class": "active"})
        if titleTag is None:
            raise RuntimeError('Failed to find a chapter title <ol class="breadcrumb"> <li class="active">')
        number, title = self._splitTitle(titleTag.text)

        # Strip scripts and advertisement containers out of the chapter body.
        self.decompose(content.find_all("script"))
        self.decompose(content.find_all("ins", {"class": "adsbygoogle"}))
        self.decompose(content.find_all("div", {"data-endpoint": "//trends.revcontent.com"}))

        # NOTE(review): index 1 presumably skips a leading whitespace node and
        # selects the text container -- confirm against the site's markup.
        return number, title, content.contents[1]

    def getAuthor(self):
        """Return the stripped text of the index page's ``author-content`` div.

        Raises:
            RuntimeError: If the index page is not loaded or has no such div.
        """
        return self._indexText("author-content")

    def getTitle(self):
        """Return the stripped text of the index page's ``post-title`` div.

        Raises:
            RuntimeError: If the index page is not loaded or has no such div.
        """
        return self._indexText("post-title")

    def getUrls(self):
        """Return the list of chapter URLs collected by :meth:`parseIndex`."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every element of *objects*."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download the index page and store the parsed result in ``indexPage``.

        Raises:
            FileNotFoundError: If the server does not answer with HTTP 200.
        """
        self.indexPage = self._fetch(self.indexUrl)

    def parseIndex(self):
        """Collect the chapter links of the index page and reset the cycle.

        The links found inside ``<ul class="version-chap">`` are stored in
        reverse document order.  Calling this method again replaces the
        previous result instead of appending to it.

        Raises:
            RuntimeError: If the index page has not been downloaded, the
                chapter list is missing, or it contains no usable link.
        """
        if self.indexPage is None:
            raise RuntimeError("downloadIndex() must be called before parseIndex()")

        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        # find_all() returns a (possibly empty) list, never None, so emptiness
        # is what has to be tested.  Anchors without an href are skipped.
        hrefs = (link.get('href') for link in chapterListTag.find_all("a"))
        urls = [href for href in hrefs if href]
        if not urls:
            raise RuntimeError('Failed to find a links to chapters <a>')

        # NOTE(review): reversed presumably because the site lists the newest
        # chapter first -- confirm.
        urls.reverse()
        self.chapterUrls = urls
        self.chapterCycle = cycle(urls)

    def _fetch(self, url):
        """Download *url* and return it parsed with the ``lxml`` parser."""
        response = requests.get(url)
        if response.status_code != 200:
            # Report the URL that actually failed, not always the index URL.
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    def _indexText(self, cssClass):
        """Return the stripped text of the index page's ``<div class=cssClass>``."""
        if self.indexPage is None:
            raise RuntimeError("downloadIndex() must be called before reading the index page")
        tag = self.indexPage.find("div", {"class": cssClass})
        if tag is None:
            raise RuntimeError('Failed to find <div class="{cls}"> on the index page'.format(cls=cssClass))
        return tag.text.strip()

    @staticmethod
    def _splitTitle(titleText):
        """Split breadcrumb text such as ``Chapter 12: Name`` into (number, title)."""
        text = titleText.strip()
        match = re.match(r'^Chapter\s?([0-9]+)[^\s]?(.*)', text)
        if match is None:
            # Not a "Chapter <n>" heading: keep the whole text as the title.
            return -1, text
        number, title = match.groups()
        return number, title.strip()
|