import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List


class Parser:
    """Scrapes a web-novel index page and iterates over its chapters.

    Usage: construct with the index URL, call prepare(), then call
    nextChapter() repeatedly (it cycles through the discovered chapters).
    """

    def __init__(self, url: str):
        # URL of the novel's index (table-of-contents) page.
        self.indexUrl: str = url
        # Chapter URLs discovered from the index page.
        self.chapterUrls: List[str] = list()
        # Parsed index page; populated by downloadIndex().
        self.indexPage: BeautifulSoup = None
        # itertools.cycle over chapterUrls; consumed by nextChapter().
        self.chapterCycle: cycle = None

    def prepare(self):
        """Download and parse the index page. Call before nextChapter()."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Fetch and parse the next chapter in the cycle.

        Returns:
            (number, title, content): the chapter number (string captured
            from the title, or -1 when the pattern does not match), the
            cleaned title text, and the chapter body tag.

        Raises:
            FileNotFoundError: when the chapter page cannot be downloaded.
            RuntimeError: when the chapter content container is missing.
        """
        url = next(self.chapterCycle)
        response = requests.get(url)
        if response.status_code != 200:
            # BUG FIX: report the chapter URL that actually failed, not the
            # index URL (the original message always blamed self.indexUrl).
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        page = BeautifulSoup(response.text, "lxml")

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content')

        # Get title from the breadcrumb's active item, e.g. "Chapter 12 - Foo".
        bcol = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        titleText = bcol.findChild("li", {"class": "active"}, recursive=True)
        titleText = titleText.text
        try:
            # Raw string: '\s' in a plain literal is an invalid escape
            # (DeprecationWarning, an error in future Pythons).
            number, title = re.findall(r'^Chapter\s?([0-9]+)[^\s]?(.*)', titleText)[0]
            title = title.strip()
        except Exception:
            # Pattern missed (or breadcrumb absent): keep the raw text.
            number = -1
            title = titleText

        # Strip scripts and ad containers from the chapter body.
        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        return number, title.strip(), content.contents[1]

    def getAuthor(self):
        """Return the author name scraped from the index page."""
        author = self.indexPage.find("div", {"class": "author-content"})
        return author.text.strip()

    def getTitle(self):
        """Return the novel title scraped from the index page."""
        title = self.indexPage.find("div", {"class": "post-title"})
        return title.text.strip()

    def getUrls(self):
        """Return the list of discovered chapter URLs."""
        return self.chapterUrls

    def decompose(self, objects):
        """Remove each of the given tags from its parse tree in place."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download the index page into self.indexPage.

        Raises:
            FileNotFoundError: when the index page cannot be downloaded.
        """
        response = requests.get(self.indexUrl)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))
        self.indexPage = BeautifulSoup(response.text, "lxml")

    def parseIndex(self):
        """Locate the chapter list on the downloaded index page.

        NOTE(review): the reviewed source chunk is truncated inside this
        method; the remainder (presumably the code filling chapterUrls and
        chapterCycle, which nextChapter/getUrls depend on) is not visible
        here and has not been reconstructed.

        Raises:
            RuntimeError: when the chapter-list element is missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list')