'''This example scrapes `infologistix <https://infologistix.de>`_. Intended usage is crawling company consulting services from the webpage.
Uses infologistix/docker-python-selenium:alpine as the image to run on.
'''
from typing import Literal
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import pymsteams
class InfologistixCrawler:
    '''Example crawler for the infologistix homepage.

    Starts a Chrome session and loads the given url on construction, then
    crawls the services listed on the webpage and returns them.

    Parameters
    ----------
    url : str
        the url to scrape
    headless : bool, default: True
        set to true when running in headless environment

    Examples
    --------
    >>> crawler = InfologistixCrawler(url="https://infologistix.de", headless=False)
    >>> print(crawler.run())
    '''

    def __init__(self, url: str, headless: bool = True) -> None:
        options = ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1280,720")
        if headless:
            options.add_argument("--headless")
        self.__driver = Chrome(options=options)
        try:
            self.__driver.get(url)
        except Exception:
            # The caller never receives an instance when the initial page
            # load fails, so the browser has to be shut down here.
            self.__driver.quit()
            raise

    def getServices(self) -> list:
        '''Scrapes the services of the webpage from infologistix GmbH.

        Waits up to 10 seconds for the element with id ``Leistungen`` and
        extracts one service from every ``section`` element below it.

        Returns
        -------
        list
            unsorted list of dict-like service structures

        Raises
        ------
        selenium.common.exceptions.TimeoutException
            the ``Leistungen`` element did not appear within 10 seconds
        selenium.common.exceptions.NoSuchElementException
            a section has no ``elementor-image-box-content`` element, or
            that element lacks the expected link / paragraph
        '''
        # until() hands back the located element, so it does not have to be
        # looked up a second time.
        container = WebDriverWait(self.__driver, 10).until(
            EC.presence_of_element_located((By.ID, "Leistungen"))
        )
        return [
            self.__extract(
                section.find_element(By.CLASS_NAME, "elementor-image-box-content")
            )
            for section in container.find_elements(By.TAG_NAME, "section")
        ]

    def __extract(self, service: "WebElement") -> dict:
        '''Extracts the service information from one content element.

        Contains URI, title and description for the service.

        Parameters
        ----------
        service : WebElement
            a services WebElement to scrape information

        Returns
        -------
        dict
            information on dict-like basis with the keys ``URI``,
            ``Title`` and ``Description``
        '''
        # Both the URI and the title come from the first link of the element.
        link = service.find_element(By.TAG_NAME, "a")
        return {
            "URI": link.get_attribute("href"),
            "Title": link.text,
            "Description": service.find_element(By.TAG_NAME, "p").text,
        }

    def makeFrame(self, services: list) -> pd.DataFrame:
        '''Converts the list into a human readable table format.

        Parameters
        ----------
        services : list
            unsorted list of services

        Returns
        -------
        pd.DataFrame
            table friendly services
        '''
        return pd.DataFrame(services)

    def run(self) -> pd.DataFrame:
        '''Runs the crawler and performs actions in the right order.

        The driver is shut down afterwards, also when scraping fails.

        Returns
        -------
        pd.DataFrame
            table friendly services
        '''
        try:
            services = self.getServices()
        finally:
            # Release the browser even if the page could not be scraped.
            self.close()
        return self.makeFrame(services)

    def close(self) -> None:
        '''Closes the driver.

        Uses ``quit()`` rather than ``close()`` so that the whole browser
        session and the driver process are ended, not only the current
        window.
        '''
        self.__driver.quit()
def sendMSTeams(webhook : str, message : str, title : str) -> Literal[True]:
    '''Post a titled message to a MS Teams channel.

    A webhook has to be configured for the target channel.

    Parameters
    ----------
    webhook : str
        URI of the webhook the card is sent to
    message : str
        body of the card; can be Text, Markdown or HTML
    title : str
        title shown for the message

    Returns
    -------
    Literal[True]
        value returned by the card's ``send()`` call, i.e. the message
        was sent
    '''
    card = pymsteams.connectorcard(webhook)
    card.title(title)
    card.text(message)
    sent = card.send()
    return sent
if __name__ == "__main__":
    crawler = InfologistixCrawler(url="https://infologistix.de")
    services = crawler.run()
    print(services)

    # Render the scraped services as an HTML table for the Teams message.
    title = "Dienstleistungen Infologistix GmbH"
    message = services.to_html()

    # Uncomment the call below once you have a webhook to send to.
    #sendMSTeams("webhook", message=message, title=title)