# Source code for main

'''This example scrapes `infologistix <https://infologistix.de>`_. Intended usage is crawling company consulting services from the webpage.

Uses infologistix/docker-python-selenium:alpine as the image to run on.

'''
from typing import Literal
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd 
import pymsteams

class InfologistixCrawler():
    '''Example crawler for the infologistix homepage.

    Opens the given URL in a Chrome session on construction and scrapes the
    services listed inside the element with the id ``Leistungen``.

    Parameters
    ----------
    url : str
        the url to scrape
    headless : bool, default: True
        set to true when running in headless environment

    Examples
    --------
    >>> crawler = InfologistixCrawler(url="https://infologistix.de", headless=False)
    >>> print(crawler.run())
    '''
    def __init__(self, url: str, headless: bool=True) -> None:
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1280,720")
        if headless:
            options.add_argument("--headless")
        self.__driver = Chrome(options=options)
        try:
            self.__driver.get(url)
        except Exception:
            # Navigation failed: shut the browser down before propagating so
            # no orphaned Chrome/chromedriver process is left behind.
            self.__driver.quit()
            raise

    def getServices(self) -> list:
        '''Scrapes the services of the webpage from infologistix GmbH.

        Waits up to 10 seconds for the element with id ``Leistungen`` to be
        present, then extracts one entry per ``section`` element below it.

        Returns
        -------
        list
            unsorted list of dict-like service structures

        Raises
        ------
        selenium.common.exceptions.TimeoutException
            if the ``Leistungen`` element does not appear in time
        selenium.common.exceptions.NoSuchElementException
            if a section lacks the expected content, link or paragraph
        '''
        # until() hands back the located element, so it is reused directly
        # instead of querying the DOM a second time.
        container = WebDriverWait(self.__driver, 10).until(
            EC.presence_of_element_located((By.ID, "Leistungen"))
        )
        return [
            self.__extract(
                section.find_element(By.CLASS_NAME, "elementor-image-box-content")
            )
            for section in container.find_elements(By.TAG_NAME, "section")
        ]

    def __extract(self, service: "WebElement") -> dict:
        '''Extracts the service information from one content element.
        Contains URI, title and description for each service.

        Parameters
        ----------
        service : WebElement
            a services WebElement to scrape information

        Returns
        -------
        dict
            information on dict-like basis with the keys ``URI``, ``Title``
            and ``Description``
        '''
        # The link carries both the target URI and the visible title.
        link = service.find_element(By.TAG_NAME, "a")
        return {
            "URI": link.get_attribute("href"),
            "Title": link.text,
            "Description": service.find_element(By.TAG_NAME, "p").text,
        }

    def makeFrame(self, services: list) -> pd.DataFrame:
        '''Converts the list into a human readable table format.

        Parameters
        ----------
        services : list
            unsorted list of services

        Returns
        -------
        pd.DataFrame
            table friendly services
        '''
        return pd.DataFrame(services)

    def run(self) -> pd.DataFrame:
        '''Runs the Crawler and performs actions in the right order.

        The browser is shut down afterwards, even when scraping fails.

        Returns
        -------
        pd.DataFrame
            table friendly services
        '''
        try:
            services = self.getServices()
        finally:
            # Always release the browser, also on a timeout or lookup error.
            self.close()
        return self.makeFrame(services)

    def close(self) -> None:
        '''Shuts down the driver and ends the browser session.
        '''
        # quit() ends the whole session including the driver process, whereas
        # close() would only close the current window.
        self.__driver.quit()



def sendMSTeams(webhook: str, message: str, title: str) -> Literal[True]:
    '''Send a message to a Teams channel.
    Needs a configured webhook for MS Teams.

    Builds a ``pymsteams`` connector card for the webhook, sets its title and
    text and sends it.

    Parameters
    ----------
    webhook : str
        webhook URI to connect to
    message : str
        a message. Can be Text, Markdown or HTML
    title : str
        the messages title

    Returns
    -------
    Literal[True]
        message was sent; this is the result of the card's ``send()`` call
    '''
    channel = pymsteams.connectorcard(webhook)
    channel.title(title)
    channel.text(message)
    return channel.send()


if __name__ == "__main__":
    # Crawl the homepage and show the resulting table on stdout.
    crawler = InfologistixCrawler(url="https://infologistix.de")
    service_table = crawler.run()
    print(service_table)
    # Render the table as HTML so it can serve as the Teams message body.
    message = service_table.to_html()
    title = "Dienstleistungen Infologistix GmbH"
    # Uncomment the call below once you have a real webhook.
    #sendMSTeams("webhook", message=message, title=title)