# NovelDownloader/parsers/boxnovel_com.py

import re
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from typing import List


class Parser:
    """Scraper for a single novel hosted on boxnovel.com.

    Typical use: construct with the URL of the novel's index page, call
    ``prepare()`` once to download that page and collect the chapter links,
    then call ``nextChapter()`` repeatedly. Chapters are served from an
    endless ``itertools.cycle``, so after the last chapter the iteration
    wraps around to the first one; use ``len(getUrls())`` to know when to stop.
    """

    # Breadcrumb titles look like "Chapter 12: The Return". The pattern
    # captures the digits, skips one optional separator character and keeps
    # the rest of the line as the chapter title.
    _TITLE_PATTERN = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url: str):
        """Remember the index URL; nothing is downloaded until ``prepare()``."""
        self.indexUrl: str = url

        # Chapter URLs in reading order, filled in by parseIndex().
        self.chapterUrls: List[str] = list()
        # Parsed index page, set by downloadIndex().
        self.indexPage: BeautifulSoup = None
        # Endless iterator over chapterUrls, set by parseIndex().
        self.chapterCycle: cycle = None

    def prepare(self):
        """Download the index page and extract the chapter URL list."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download and parse the next chapter of the cycle.

        ``prepare()`` must have been called first so that the chapter cycle
        exists.

        Returns:
            A ``(number, title, body)`` tuple. ``number`` is the digit string
            taken from a "Chapter N ..." breadcrumb title, or the integer
            ``-1`` when the title does not follow that pattern (in which case
            ``title`` is the whole breadcrumb text). ``body`` is the second
            child node of the ``reading-content`` container.

        Raises:
            FileNotFoundError: the chapter page did not answer with HTTP 200.
            RuntimeError: the chapter container or the breadcrumb title is
                missing from the page.
        """
        url = next(self.chapterCycle)
        page = self._download(url)

        # Extract chapter
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get title
        number, title = self._parseTitle(self._breadcrumbTitle(page))

        # Strip scripts and advertisement containers from the chapter text.
        self.decompose(content.find_all("script", recursive=True))
        self.decompose(content.find_all("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.find_all("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        # NOTE(review): index 1 assumes the chapter body is the second child of
        # the container (the first presumably being whitespace) - confirm
        # against the live page layout.
        return number, title, content.contents[1]

    def getAuthor(self):
        """Return the author text of the index page, stripped of whitespace."""
        author = self.indexPage.find("div", {"class": "author-content"})
        return author.text.strip()

    def getTitle(self):
        """Return the novel title of the index page, stripped of whitespace."""
        title = self.indexPage.find("div", {"class": "post-title"})
        return title.text.strip()

    def getUrls(self):
        """Return the list of chapter URLs in reading order."""
        return self.chapterUrls

    def decompose(self, objects):
        """Call ``decompose()`` on every element of ``objects``."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Download and parse the index page into ``self.indexPage``.

        Raises:
            FileNotFoundError: the index page did not answer with HTTP 200.
        """
        self.indexPage = self._download(self.indexUrl)

    def parseIndex(self):
        """Collect the chapter URLs from the index page and build the cycle.

        The links are reversed before being stored (the page presumably lists
        the newest chapter first). Calling this method again replaces the
        previous URLs instead of appending duplicates; the list object handed
        out by ``getUrls()`` stays the same.

        Raises:
            RuntimeError: the chapter list or its links cannot be found.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        # find_all() returns an empty list (never None) when nothing matches,
        # so emptiness is what has to be checked. Anchors without an href are
        # skipped because they cannot be downloaded.
        links = chapterListTag.find_all("a", recursive=True)
        urls = [href for href in (link.get('href') for link in links) if href]
        if not urls:
            raise RuntimeError('Failed to find a links to chapters <a>')

        urls.reverse()
        # Slice assignment keeps the identity of the list returned by getUrls().
        self.chapterUrls[:] = urls
        self.chapterCycle = cycle(self.chapterUrls)

    def _download(self, url) -> "BeautifulSoup":
        """Fetch ``url`` and return it parsed with the lxml parser.

        Raises:
            FileNotFoundError: the server did not answer with HTTP 200. The
                message names the URL that was actually requested.
        """
        response = requests.get(url)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        return BeautifulSoup(response.text, "lxml")

    def _breadcrumbTitle(self, page):
        """Return the raw text of the active breadcrumb entry of ``page``."""
        breadcrumb = page.find("ol", {"class": "breadcrumb"}, recursive=True)
        active = None
        if breadcrumb is not None:
            active = breadcrumb.find("li", {"class": "active"}, recursive=True)
        if active is None:
            raise RuntimeError('Failed to find a chapter title <ol class="breadcrumb"> <li class="active">')
        return active.text

    @classmethod
    def _parseTitle(cls, titleText):
        """Split a breadcrumb title into ``(number, title)``.

        Surrounding whitespace is removed first so that the "Chapter" prefix
        is recognised even when the markup pads the text with newlines.
        Titles that do not match yield ``(-1, <stripped text>)``.
        """
        titleText = titleText.strip()
        match = cls._TITLE_PATTERN.match(titleText)
        if match is None:
            return -1, titleText
        number, title = match.groups()
        return number, title.strip()