slovak_driver_tests/autoskola.py

#!/usr/bin/env python
# ============================================================
# Slovak Driver Tests extractor
# Copyright (C) 2025 Juraj Oravec <jurajoravec@mailo.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ============================================================


import argparse
import importlib
import os
import shutil
from urllib.request import urlretrieve

# Version string reported by the ``--version`` command-line flag.
SCRIPT_VERSION = '0.2.0'

# Index of each language inside the downloaded data set; parseData() uses the
# configured value to select the language-specific part of the raw data.
LANGUAGE_INDEX_SK, LANGUAGE_INDEX_EN, LANGUAGE_INDEX_HU = 0, 1, 2

# Run-time settings. main() overwrites them from the command line and the
# Autoskola methods read them.
configuration: dict = dict(
    language=LANGUAGE_INDEX_EN,
    force_data_download=False,
)

# Common URL prefix; the data file name and the image names are appended to it.
BASE_URL = "https://www.minv.sk/egovinet02/PCPZobrazFile?fileName=pcpfiles/"
TESTS_DATA_URL: str = f"{BASE_URL}data5.js"


class Autoskola():
    """Download, parse and render the Slovak driver-test question data.

    The methods are meant to be called in this order: ``downloadData``,
    ``loadData``, ``parseData``, ``downloadImages``, ``print_to_html``.
    """

    def __init__(self):
        # Ascending question numbers at which a new section ("okruh") starts;
        # get_index() maps a question number onto a position in this tuple.
        self.okruhy: tuple = (1, 9, 11, 19, 23, 24, 27, 29, 31, 39)
        # Local copy of the downloaded JavaScript data file.
        self.dataFilenameTmp: str = "data.js"
        # The same content without the leading "var ", loaded as Python source.
        self.dataFilename: str = "autoskola_data.py"
        # Module object created by loadData() and its ``data`` attribute.
        self.dataModule = None
        self.rawData: dict = dict()
        # One {"nazov": <section title>, "data": [<question dicts>]} entry per
        # section, filled in by parseData().
        self.parsedData: list = list()

    def get_index(self, id_otazka):
        """Return the section position that question number *id_otazka* maps to.

        The result is the index of the last entry of ``self.okruhy`` that is
        not greater than the number.  A number smaller than the first entry
        yields ``-1``; a caller indexing a list with that value addresses the
        list's last element.

        :param id_otazka: question number, anything accepted by ``int()``.
        :raises ValueError: if *id_otazka* cannot be converted to ``int``.
        """
        number = int(id_otazka)

        for position, section_start in enumerate(self.okruhy):
            if number < section_start:
                return position - 1

        return len(self.okruhy) - 1

    def otazka_exist_id(self, otazka_id: str, okruh_id: int):
        """Tell whether section *okruh_id* already holds question *otazka_id*.

        :param otazka_id: value compared against the stored ``"otazka_id"``.
        :param okruh_id: index into ``self.parsedData``.
        :return: ``True`` if a stored question has that id, else ``False``.
        """
        return any(
            question["otazka_id"] == otazka_id
            for question in self.parsedData[okruh_id]["data"]
        )

    def downloadData(self):
        """Fetch the data file if needed and store it as a Python source file.

        The download is skipped when ``self.dataFilenameTmp`` already exists,
        unless ``configuration["force_data_download"]`` is set.  The content
        is then copied to ``self.dataFilename`` without its leading ``var ``.
        """
        if configuration["force_data_download"] or not os.path.exists(self.dataFilenameTmp):
            urlretrieve(TESTS_DATA_URL, self.dataFilenameTmp)

        if os.path.exists(self.dataFilenameTmp):
            js_prefix = b"var "
            # Copy bytes rather than text so the result does not depend on the
            # locale's default encoding.
            with open(self.dataFilenameTmp, 'rb') as source, \
                    open(self.dataFilename, 'wb') as target:
                head = source.read(len(js_prefix))
                # Only drop the first bytes when they really are the prefix.
                if head != js_prefix:
                    target.write(head)
                shutil.copyfileobj(source, target)

    def loadData(self):
        """Load ``self.dataFilename`` as a module and keep its ``data`` value.

        The file is loaded from its path, so it is found regardless of whether
        its directory is part of ``sys.path``.

        NOTE(security): this executes the downloaded file as Python code.
        If the payload is a pure literal, parsing it (``ast.literal_eval`` or
        ``json``) would avoid running remote content - to be confirmed against
        the real data before switching.

        :raises ImportError: if the file cannot be loaded as a module
            (``ModuleNotFoundError`` when the file does not exist).
        :raises AttributeError: if the module defines no ``data`` name.
        """
        # The file-level "import importlib" does not guarantee that the
        # "util" submodule is loaded, hence the local import.
        import importlib.util

        module_path = os.path.abspath(self.dataFilename)
        module_name = os.path.splitext(os.path.basename(module_path))[0]

        spec = importlib.util.spec_from_file_location(module_name, module_path)
        if spec is None or spec.loader is None:
            raise ImportError(
                "cannot load {path!r} as a Python module".format(path=module_path)
            )

        module = importlib.util.module_from_spec(spec)
        try:
            spec.loader.exec_module(module)
        except FileNotFoundError as err:
            raise ModuleNotFoundError(
                "data file {path!r} does not exist".format(path=module_path),
                name=module_name,
            ) from err

        self.dataModule = module
        self.rawData = module.data

    def downloadImages(self):
        """Download every image referenced by the parsed questions.

        Images are stored below ``images/`` under the relative name found in
        the question's ``"obrazok"`` field.  Existing files are kept unless
        ``configuration["force_data_download"]`` is set.

        NOTE(review): the image names come from the downloaded data and are
        used as file paths unchanged; confirm they can never contain ``..``.
        """
        force = configuration["force_data_download"]

        # Dict keys serve as an insertion-ordered set of unique image names.
        images: dict = {}
        for section in self.parsedData[:len(self.okruhy)]:
            for question in section["data"]:
                if question["obrazok"]:
                    images[question["obrazok"]] = None

        for image in images:
            imagePath = 'images/' + image
            # Create the whole parent directory chain, however deep it is.
            os.makedirs(os.path.dirname(imagePath), exist_ok=True)

            # urlretrieve overwrites an existing file, so a forced refresh
            # needs no separate removal step.
            if force or not os.path.exists(imagePath):
                urlretrieve(BASE_URL + image, imagePath)

    def parseData(self):
        """Fill ``self.parsedData`` from the raw data of the configured language.

        Every data set provides ``"okruhy"`` (section descriptions),
        ``"otazky"`` (questions keyed by their number) and ``"odpovede"``
        (answer lists keyed by the same number).  Sections are created from
        the first data set only.  A question is stored once per section; the
        stored answer is the one selected by the question's 1-based
        ``"platna"`` field (presumably the correct answer).
        """
        for data_set in self.rawData[configuration["language"]]:
            # Initialize the sections on the first data set.
            if not self.parsedData:
                self.parsedData.extend(
                    {"nazov": data[0]["txt"], "data": []}
                    for data in data_set["okruhy"].values()
                )

            for number, data in data_set["otazky"].items():
                question = data[0]
                okruh_id = self.get_index(number)

                if self.otazka_exist_id(question["id"], okruh_id):
                    continue

                answers = data_set["odpovede"][number]
                self.parsedData[okruh_id]["data"].append({
                    "cislo_otazky": number,
                    "otazka_id": question["id"],
                    "otazka": question["text"],
                    "odpoved": answers[question["platna"] - 1]["odpoved"],
                    "obrazok": question["obrazok"]
                })

    def print_to_html(self):
        """Print the parsed questions to stdout as one HTML document.

        The document contains a navigation list and one table per section;
        the rows of a table are ordered by ``"otazka_id"``.

        NOTE(review): question and answer texts are written without HTML
        escaping - confirm whether the source data relies on embedded markup.
        """
        # okruhy_limits[i] is subtracted from an "otazka_id" (plus one) of
        # section i to get the number shown in the Id column.  The increments
        # are presumably the question counts of the preceding sections.
        first_id = 1000
        section_sizes = (525, 75, 189, 89, 28, 92, 69, 59, 158)
        okruhy_limits: list = [first_id]
        for size in section_sizes:
            okruhy_limits.append(okruhy_limits[-1] + size)

        sections = self.parsedData[:len(self.okruhy)]

        styles = """
            table {
                border-spacing: 4px;
                border: 1px solid black;
                border: 1px solid black;
            }
            table td, table th {
                padding: 5px;
                border: 1px solid black;
            }
            img {
                max-height: 100px;
                float: left;
                margin-right: 1rem;
            }

            .section_4 img {
                max-height: 150px;
            }
            @media print {
                .pagebreak {
                    clear: both;
                    page-break-after: always;
                }
                tr {
                    page-break-inside: avoid;
                }
                nav {
                    display: none;
                }
            }
        """

        print("<html><head>")
        print("<title>Car test data</title>")
        print("<style>{styles}</style></head><body><h1>Car Test data</h1>".format(styles=styles))

        print("<nav><ol>")
        for section_id, section in enumerate(sections, start=1):
            section_name = section["nazov"]
            print(f"<li><a href='#section{section_id}'>{section_name}</a></li>")
        print("</ol></nav>")

        for index, section in enumerate(sections):
            section_id = index + 1
            caption = section["nazov"]

            if index != 0:
                print("<div class='pagebreak'> </div>")

            print(f"<a name='section{section_id}'></a>")
            print(f"<table class='section_{section_id}'>")
            print(f"<caption>{section_id}. {caption}</caption>")

            print("<thead><tr><th>Id</th><th>Question</th><th>Answer</th></tr></thead>")

            questions = section["data"]
            # Direct lookup by id; the first question wins if an id repeats.
            by_id: dict = {}
            for question in questions:
                by_id.setdefault(question["otazka_id"], question)

            for sorted_id in sorted(question["otazka_id"] for question in questions):
                otazka_data = by_id[sorted_id]

                print("<tr>")
                obrazok = ''
                if otazka_data["obrazok"]:
                    obrazok = "<a href='images/{image}' target='_blank'><img src='images/{image}'></a>".format(
                        image=otazka_data["obrazok"]
                    )

                print("<td>{otazka_id}</td>".format(
                    otazka_id=(sorted_id + 1 - okruhy_limits[index]))
                )
                print("<td>{obrazok}{question}</td>".format(
                    obrazok=obrazok,
                    question=otazka_data["otazka"])
                )
                print("<td>{answer}</td>".format(answer=otazka_data["odpoved"]))
                print("</tr>")

            print("</table>")

        print("</body></html>")


def _language_index(value):
    """Convert a ``--language`` argument into a ``LANGUAGE_INDEX_*`` value.

    Accepts the language codes ``SK``, ``EN`` and ``HU`` (in any letter case)
    as well as the numeric indexes themselves.

    :raises argparse.ArgumentTypeError: for any other value, so that argparse
        reports a regular usage error instead of a traceback.
    """
    by_code = {
        'SK': LANGUAGE_INDEX_SK,
        'EN': LANGUAGE_INDEX_EN,
        'HU': LANGUAGE_INDEX_HU,
    }

    text = str(value).strip()
    code = text.upper()
    if code in by_code:
        return by_code[code]

    # Not a language code: only now try to read the value as a number.
    try:
        number = int(text)
    except ValueError:
        number = None

    if number not in by_code.values():
        raise argparse.ArgumentTypeError(
            "unknown language {value!r}; expected 0/SK, 1/EN or 2/HU".format(value=value)
        )

    return number


def main():
    """Parse the command line, then download, parse and print the test data.

    The chosen language and the force-download flag are stored in the
    module-level ``configuration`` before the ``Autoskola`` steps run.  The
    resulting HTML document is written to stdout.
    """
    parser = argparse.ArgumentParser(
        description='Downloads and extracts the data from the Slovak online driver tests. '
                    'Outputs the resulted HTML file into the stdout.',
    )
    # argparse also passes the string default through ``type``.
    parser.add_argument('-l', '--language', dest='language', action='store',
                        type=_language_index,
                        default=str(LANGUAGE_INDEX_EN),
                        help='Test language: '
                             '0,SK - Slovak; '
                             '1,EN - English; '
                             '2,HU - Hungarian;')
    parser.add_argument('-f', '--force-data-download', dest='force_data_download', action='store_true',
                        help='Download data even when the files already exist.')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {version}'.format(version=SCRIPT_VERSION))

    args = parser.parse_args()

    configuration["language"] = args.language
    configuration["force_data_download"] = args.force_data_download

    autoskola = Autoskola()
    autoskola.downloadData()
    autoskola.loadData()
    autoskola.parseData()
    autoskola.downloadImages()
    autoskola.print_to_html()


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    raise SystemExit(main())