86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
|
import re
|
||
|
import requests
|
||
|
from bs4 import BeautifulSoup
|
||
|
from itertools import cycle
|
||
|
from typing import List
|
||
|
|
||
|
|
||
|
class Parser:
    """Scraper for a Madara-themed web-novel site.

    The index page (table of contents) lists chapter links inside
    <ul class="version-chap">; each chapter page carries its text inside
    <div class="reading-content">.  Typical usage::

        parser = Parser(index_url)
        parser.prepare()
        number, title, body = parser.nextChapter()
    """

    # Compiled once at class creation; matches e.g. "Chapter 12 - Some Name".
    # Raw string fixes the invalid '\s' escape sequence the original plain
    # string literal produced (SyntaxWarning on modern Python).
    _CHAPTER_RE = re.compile(r'^Chapter\s?([0-9]+)[^\s]?(.*)')

    def __init__(self, url : str):
        """Remember *url*, the address of the novel's index page."""
        self.indexUrl : str = url

        self.chapterUrls : List[str] = []
        # Parsed index page (BeautifulSoup); populated by downloadIndex().
        self.indexPage = None
        # Endless iterator over chapterUrls; populated by parseIndex().
        self.chapterCycle = None

    def prepare(self):
        """Download and parse the index page.  Call before nextChapter()."""
        self.downloadIndex()
        self.parseIndex()

    def nextChapter(self):
        """Download the next chapter in the cycle and return it.

        Returns:
            (number, title, content) where *number* is the chapter number as
            a string (or int -1 when the breadcrumb title does not parse),
            *title* is the chapter title, and *content* is a BeautifulSoup
            tag holding the chapter body.

        Raises:
            FileNotFoundError: on a non-200 HTTP response.
            RuntimeError: when the content <div> is missing from the page.
        """
        url = next(self.chapterCycle)

        response = requests.get(url)
        if response.status_code != 200:
            # Bug fix: report the chapter URL that failed, not the index URL.
            raise FileNotFoundError("Unable to download {url}".format(url=url))
        page = BeautifulSoup(response.text, "lxml")

        # Extract chapter body.
        content = page.find("div", {"class": "reading-content"})
        if content is None:
            raise RuntimeError('Failed to find a chapter content <div class="reading-content">')

        # Get chapter number and title from the breadcrumb trail.
        number, title = self._parseBreadcrumbTitle(page)

        # Strip scripts and ad containers out of the chapter body.
        self.decompose(content.findChildren("script", recursive=True))
        self.decompose(content.findChildren("ins", {"class": "adsbygoogle"}, recursive=True))
        self.decompose(content.findChildren("div", {"data-endpoint": "//trends.revcontent.com"}, recursive=True))

        # NOTE(review): contents[1] assumes the chapter body is the second
        # child of the content <div> (index 0 is typically whitespace) —
        # fragile, but preserved as-is.
        return number, title.strip(), content.contents[1]

    def _parseBreadcrumbTitle(self, page):
        """Return (number, title) parsed from the breadcrumb's active entry.

        *number* is the captured digit string, or int -1 when the text does
        not look like "Chapter N ..." (the raw text is then the title).
        A missing breadcrumb yields (-1, "") instead of an AttributeError.
        """
        breadcrumb = page.findChild("ol", {"class": "breadcrumb"}, recursive=True)
        active = breadcrumb.findChild("li", {"class": "active"}, recursive=True) if breadcrumb else None
        titleText = active.text if active is not None else ""

        match = self._CHAPTER_RE.match(titleText)
        if match:
            number, title = match.groups()
            return number, title.strip()
        return -1, titleText

    def getAuthor(self):
        """Return the author name scraped from the index page."""
        author = self.indexPage.find("div", {"class": "author-content"})
        if author is None:
            raise RuntimeError('Failed to find the author <div class="author-content">')
        return author.text.strip()

    def getTitle(self):
        """Return the novel title scraped from the index page."""
        title = self.indexPage.find("div", {"class": "post-title"})
        if title is None:
            raise RuntimeError('Failed to find the title <div class="post-title">')
        return title.text.strip()

    def getUrls(self):
        """Return the chapter URL list (oldest first; empty until prepare())."""
        return self.chapterUrls

    def decompose(self, objects):
        """Remove every tag in *objects* from its tree (BeautifulSoup decompose)."""
        for obj in objects:
            obj.decompose()

    def downloadIndex(self):
        """Fetch the index page and parse it into self.indexPage.

        Raises:
            FileNotFoundError: on a non-200 HTTP response.
        """
        response = requests.get(self.indexUrl)
        if response.status_code != 200:
            raise FileNotFoundError("Unable to download {url}".format(url=self.indexUrl))

        self.indexPage = BeautifulSoup(response.text, "lxml")

    def parseIndex(self):
        """Collect chapter URLs from the index page and build the chapter cycle.

        Raises:
            RuntimeError: when the chapter list or its links are missing.
        """
        chapterListTag = self.indexPage.find("ul", {"class": "version-chap"})
        if chapterListTag is None:
            raise RuntimeError('Failed to find a chapter list <ul class="version-chap">')

        chapters = chapterListTag.findChildren("a", recursive=True)
        # Bug fix: findChildren returns a list (never None), so the original
        # 'is None' check could never fire; test for emptiness instead.
        if not chapters:
            raise RuntimeError('Failed to find a links to chapters <a>')

        # Bug fix: rebuild (rather than append to) the list so repeated
        # prepare() calls don't duplicate entries.  The site lists newest
        # chapters first, so reverse into reading order.
        self.chapterUrls = [chapter.get('href') for chapter in chapters]
        self.chapterUrls.reverse()
        self.chapterCycle = cycle(self.chapterUrls)
|