NovelDownloader/parsers/boxnovel_com.py

86 lines
2.9 KiB
Python
Raw Normal View History

import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List
class Parser:
    """Scrape a novel's index page and its chapters (boxnovel.com layout).

    Usage: construct with the URL of the novel's index page, call
    ``prepare()`` once, then call ``nextChapter()`` for each chapter.
    Chapter URLs are served from an endless ``itertools.cycle``, so
    ``nextChapter()`` wraps around to the first chapter after the last one;
    callers should bound their loop with ``len(getUrls())``.
    """

    def __init__(self, url: str):
        """Store the index URL; nothing is downloaded until ``prepare()``."""
        self.indexUrl: str = url
        # Chapter links in the reverse of their order on the index page
        # (presumably oldest first -- verify against the site).
        self.chapterUrls: List[str] = []
        # Parsed index page; stays None until downloadIndex() has run.
        self.indexPage: BeautifulSoup = None
        # Endless iterator over chapterUrls; stays None until parseIndex().
        self.chapterCycle: cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URLs from it."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter from the URL cycle.

        Returns:
            A ``(number, title, body)`` tuple. ``number`` is the chapter
            number as a digit string taken from the breadcrumb title, or the
            integer ``-1`` when the title does not start with
            ``Chapter <n>``. ``title`` is the rest of the title, stripped.
            ``body`` is the second child node of the ``reading-content``
            element, after scripts and ad containers have been removed.

        Raises:
            FileNotFoundError: the chapter page did not answer with HTTP 200.
            RuntimeError: the content container or the breadcrumb title is
                missing from the page.
        """
        url = next(self.chapterCycle)
        page = self._fetchPage(url)

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title: it is the active (last) entry of the breadcrumb trail.
        breadcrumb = page.find("ol", {"class": "breadcrumb"})
        activeItem = None
        if breadcrumb is not None:
            activeItem = breadcrumb.find("li", {"class": "active"})
        if activeItem is None:
            raise RuntimeError(
                'Failed to find a chapter title <ol class="breadcrumb"> <li class="active">'
            )
        number, title = self._splitTitle(activeItem.text)

        # Drop scripts and advertising blocks embedded in the chapter body.
        self.decompose(content.find_all("script"))
        self.decompose(content.find_all("ins", {"class": "adsbygoogle"}))
        self.decompose(content.find_all("div", {"data-endpoint": "//trends.revcontent.com"}))

        # NOTE(review): assumes the chapter text is always the second child of
        # the container (index 0 presumably being whitespace) -- confirm; a
        # container with fewer than two children raises IndexError here.
        return number, title, content.contents[1]

    def getAuthor(self):
        """Return the stripped text of the index page's author element.

        Raises:
            RuntimeError: the index page has no ``author-content`` element.
        """
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            raise RuntimeError('Failed to find an author <div class="author-content">')
        return author.text.strip()

    def getTitle(self):
        """Return the stripped text of the index page's title element.

        Raises:
            RuntimeError: the index page has no ``post-title`` element.
        """
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find a novel title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs collected by ``parseIndex()``."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every element of *objects*."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download the index page and keep the parsed result in ``indexPage``.

        Raises:
            FileNotFoundError: the index page did not answer with HTTP 200.
        """
        self.indexPage = self._fetchPage(self.indexUrl)

    def parseIndex(self):
        """Collect chapter links from the index page and reset the cycle.

        The URL list is rebuilt from scratch on every call, so calling this
        (or ``prepare()``) more than once does not duplicate entries.

        Raises:
            RuntimeError: the chapter list, or any link inside it, is missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        # href=True skips anchors that carry no target at all.
        chapters = chapterListTag.find_all("a", href=True)
        # find_all returns an empty result rather than None when nothing matches.
        if not chapters:
            raise RuntimeError('Failed to find a links to chapters <a>')

        urls = [chapter.get('href') for chapter in chapters]
        # The stored order is the reverse of the order on the page.
        urls.reverse()
        self.chapterUrls = urls
        self.chapterCycle = cycle(self.chapterUrls)

    def _fetchPage(self, url):
        """Download *url* and return it parsed with the lxml parser.

        Raises:
            FileNotFoundError: the server did not answer with HTTP 200. The
                message names the URL that was actually requested.
        """
        response = requests.get(url)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    @staticmethod
    def _splitTitle(titleText):
        """Split a ``Chapter <n><sep> <name>`` title into ``(number, name)``.

        Surrounding whitespace is ignored. When the text does not start with
        ``Chapter`` followed by digits, ``(-1, stripped text)`` is returned.
        """
        text = titleText.strip()
        match = re.match(r'^Chapter\s?([0-9]+)[^\s]?(.*)', text)
        if match is None:
            return -1, text
        return match.group(1), match.group(2).strip()