86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
|
import re
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
from itertools import cycle
|
||
|
from typing import List
|
||
|
|
||
|
|
||
|
class Parser:
    """Scraper for a Madara-themed web-novel site.

    The index page (table of contents) lists chapter links inside
    <ul class="version-chap">; each chapter page carries its text inside
    <div class="reading-content">.  Typical usage::

        parser = Parser(index_url)
        parser.prepare()
        number, title, body = parser.nextChapter()
    """

    # Compiled once at class creation; matches e.g. "Chapter 12 - Some Name".
    # Raw string fixes the invalid '\s' escape sequence the original plain
    # string literal produced (SyntaxWarning on modern Python).
    _CHAPTER_RE = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        """Remember *url*, the address of the novel's index page."""
        self.indexUrl : str = url

        self.chapterUrls : List[str] = []
        # Parsed index page (BeautifulSoup); populated by downloadIndex().
        self.indexPage = None
        # Endless iterator over chapterUrls; populated by parseIndex().
        self.chapterCycle = None

    def prepare(self):
        """Download and parse the index page.  Call before nextChapter()."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download the next chapter in the cycle and return it.

        Returns:
            (number, title, content) where *number* is the chapter number as
            a string (or int -1 when the breadcrumb title does not parse),
            *title* is the chapter title, and *content* is a BeautifulSoup
            tag holding the chapter body.

        Raises:
            FileNotFoundError: on a non-200 HTTP response.
            RuntimeError: when the content <div> is missing from the page.
        """
        url = next(self.chapterCycle)

        response = requests.get(url)
        if response.status_code != 200:
            # Bug fix: report the chapter URL that failed, not the index URL.
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        page = BeautifulSoup(response.text, "lxml")

        # Extract chapter body.
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get chapter number and title from the breadcrumb trail.
        number, title = self._parseBreadcrumbTitle(page)

        # Strip scripts and ad containers out of the chapter body.
        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        # NOTE(review): contents[1] assumes the chapter body is the second
        # child of the content <div> (index 0 is typically whitespace) —
        # fragile, but preserved as-is.
        return number, title.strip(), content.contents[1]

    def _parseBreadcrumbTitle(self, page):
        """Return (number, title) parsed from the breadcrumb's active entry.

        *number* is the captured digit string, or int -1 when the text does
        not look like "Chapter N ..." (the raw text is then the title).
        A missing breadcrumb yields (-1, "") instead of an AttributeError.
        """
        breadcrumb = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        active = breadcrumb.findChild("li", {"class": "active"}, recursive=True) if breadcrumb else None
        titleText = active.text if active is not None else ""

        match = self._CHAPTER_RE.match(titleText)
        if match:
            number, title = match.groups()
            return number, title.strip()
        return -1, titleText

    def getAuthor(self):
        """Return the author name scraped from the index page."""
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            raise RuntimeError('Failed to find the author <div class="author-content">')
        return author.text.strip()

    def getTitle(self):
        """Return the novel title scraped from the index page."""
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find the title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the chapter URL list (oldest first; empty until prepare())."""
        return self.chapterUrls

    def decompose(self, objects):
        """Remove every tag in *objects* from its tree (BeautifulSoup decompose)."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Fetch the index page and parse it into self.indexPage.

        Raises:
            FileNotFoundError: on a non-200 HTTP response.
        """
        response = requests.get(self.indexUrl)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))

        self.indexPage = BeautifulSoup(response.text, "lxml")

    def parseIndex(self):
        """Collect chapter URLs from the index page and build the chapter cycle.

        Raises:
            RuntimeError: when the chapter list or its links are missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        chapters = chapterListTag.findChildren("a", recursive=True)
        # Bug fix: findChildren returns a list (never None), so the original
        # 'is None' check could never fire; test for emptiness instead.
        if not chapters:
            raise RuntimeError('Failed to find a links to chapters <a>')

        # Bug fix: rebuild (rather than append to) the list so repeated
        # prepare() calls don't duplicate entries.  The site lists newest
        # chapters first, so reverse into reading order.
        self.chapterUrls = [chapter.get('href') for chapter in chapters]
        self.chapterUrls.reverse()
        self.chapterCycle = cycle(self.chapterUrls)
|