From 87f84b250e5183019f3aca51ca488e2662e54ce6 Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 23:16:26 +0100 Subject: [PATCH] Logging and documentation in scraper --- src/logger.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/scraper.py | 43 ++++++++++++++++++++++++-------------- 2 files changed, 84 insertions(+), 15 deletions(-) create mode 100644 src/logger.py diff --git a/src/logger.py b/src/logger.py new file mode 100644 index 0000000..29a6531 --- /dev/null +++ b/src/logger.py @@ -0,0 +1,56 @@ +import logging +import os +from logging.handlers import RotatingFileHandler +from pathlib import Path + + +class LoggerWrapper(): + + def __init__(self, logger_name: str = "standard_logger"): + + self.logger = logging.getLogger(logger_name) + self.logger.setLevel(logging.DEBUG) + self.setup_logger() + + def setup_logger(self): + # Define and create folder + logs_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'logs') + Path(logs_folder).mkdir(parents=True, exist_ok=True) + + # Define file path + log_file = log_file = os.path.join(logs_folder, 'predictify.log') + + # Setup File Handler + handler = RotatingFileHandler(log_file, maxBytes=1000000, backupCount=5) + handler.setLevel(logging.DEBUG) + + # Setup Console Handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.WARNING) + + # Setup Formatter + formatter = logging.Formatter('%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s') + + # Add Formatters to Handlers + handler.setFormatter(formatter) + console_handler.setFormatter(formatter) + + # Add Handlers to Logger + self.logger.addHandler(handler) + self.logger.addHandler(console_handler) + + def info(self, message): + self.logger.info(message) + + def debug(self, message): + self.logger.debug(message) + + def warning(self, message): + self.logger.warning(message) + + def error(self, message): + self.logger.error(message) + + def critical(self, message): + self.logger.critical(message) + # Here we can add alerting/handling diff --git a/src/scraper.py b/src/scraper.py index 489b72f..bee9dba 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -1,16 +1,17 @@ from auth import authenticate, simple_authenticate from database_handler import Database, Table +from logger import LoggerWrapper from spotify_api import get_last_played_track, get_multiple_field_information # Define DB db = Database() +log = LoggerWrapper() -def scraping(): +def scraping() -> None: """ This function is the main function that will be executed when the script is run """ - global db scope = "user-read-recently-played" bearer_token = authenticate(scope) @@ -19,7 +20,7 @@ def scraping(): scrape_missing_infos() -def _read_recently_played_page_and_add_to_db(bearer_token: str): +def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None: """ This function gets a list of song play history and adds it into the database. """ @@ -27,26 +28,30 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str): last_played_track = get_last_played_track(bearer_token=bearer_token) - for track in reversed(last_played_track['items']): - track_id = track['track']['id'] - played_at = track['played_at'] - album_id = track['track']['album']['id'] - artist_id = track['track']['artists'][0]['id'] - db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id)) + try: + for track in reversed(last_played_track['items']): + track_id = track['track']['id'] + played_at = track['played_at'] + album_id = track['track']['album']['id'] + artist_id = track['track']['artists'][0]['id'] + db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id)) + except Exception as e: + log.error(f"Failed to add returned play history to database: {e}" + f"\nReturned Value: {last_played_track}") -def scrape_missing_infos(): +def scrape_missing_infos() -> None: """ """ bearer_token_simple = simple_authenticate() - _scrape_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') - _scrape_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') - _scrape_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') + _process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') + _process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') + _process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') -def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str): +def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None: if endpoint_name == 'albums': limit = 20 @@ -57,6 +62,8 @@ def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_n all_ids_saved = db.read_all_rows(table_name, id_field_name) all_ids_missing = list(set(all_ids_recently_played) - set(all_ids_saved)) + log.debug(f"Number of missing {table_name.name} entries: {len(all_ids_missing)}. Inserting...") + ids = [] processed_ids = set() @@ -85,16 +92,20 @@ def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_n _add_data_to_database(table_name, response) -def _add_data_to_database(table_name: Table, response): +def _add_data_to_database(table_name: Table, response) -> None: global db if table_name == Table.TRACK_INFORMATION: + log.debug('Adding track information to database') for entry in response['tracks']: + log.debug(f"Adding track: {entry['name']}") db.add_row(table_name, (entry['id'], entry['name'], entry['duration_ms'], entry['explicit'], entry['popularity'])) elif table_name == Table.ALBUM_INFORMATION: + log.debug('Adding album information to database') for entry in response['albums']: + log.debug(f"Adding album: {entry['name']}") try: release_year = entry['release_date'][:4] except Exception: @@ -102,7 +113,9 @@ def _add_data_to_database(table_name: Table, response): db.add_row(table_name, (entry['id'], entry['name'], entry['album_type'], entry['total_tracks'], release_year, entry['label'])) elif table_name == Table.ARTIST_INFORMATION: + log.debug('Adding artist information to database') for entry in response['artists']: + log.debug(f"Adding artist: {entry['name']}") try: genre = entry['genres'][0] except IndexError: