From 4ae2a5ace65dc2cbb4e02fddafe34e1ee30a2eaa Mon Sep 17 00:00:00 2001 From: agres Date: Mon, 24 Mar 2025 12:20:37 +0100 Subject: [PATCH] reworked db initialisation to separate a test and prod db --- src/database_handler.py | 22 +++++++++++++++------ src/gdpr_export.py | 10 ++++------ src/runtime.py | 18 +++++++++++++----- src/scraper.py | 42 ++++++++++++++++++++++++----------------- 4 files changed, 58 insertions(+), 34 deletions(-) diff --git a/src/database_handler.py b/src/database_handler.py index 9b4fdab..773549c 100644 --- a/src/database_handler.py +++ b/src/database_handler.py @@ -1,10 +1,9 @@ -import os import sqlite3 from enum import Enum from logger import LoggerWrapper -DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db') +# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db') log = LoggerWrapper() @@ -22,7 +21,7 @@ class Database: A class to handle the database connection and operations """ - def __init__(self, db_name: str = DATABASE_PATH): + def __init__(self, db_name: str): """Initialize the connection to the database""" self.db_name = db_name self.conn = sqlite3.connect(db_name) @@ -66,8 +65,18 @@ class Database: self.cursor.execute(f''' CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} ( track_id TEXT PRIMARY KEY, - attribute_name TEXT, - attribute_value TEXT + acousticness FLOAT, + danceability FLOAT, + duration_ms INTEGER, + energy FLOAT, + instrumentalness FLOAT, + key INTEGER, + liveness FLOAT, + loudness FLOAT, + speechiness FLOAT, + tempo FLOAT, + time_signature INTEGER, + valence FLOAT ); ''') @@ -79,7 +88,8 @@ class Database: album_id TEXT, FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id), FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id), - FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id) + FOREIGN KEY (album_id) REFERENCES 
{Table.ALBUM_INFORMATION.value}(album_id), + FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id) ); ''') diff --git a/src/gdpr_export.py b/src/gdpr_export.py index 222696a..31ed6b1 100644 --- a/src/gdpr_export.py +++ b/src/gdpr_export.py @@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information # Define the absolute folder path to the folder containing the gdrp retrieved data folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data') -# Define the db -db = Database() + log = LoggerWrapper() @@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued): return all_songs_played -def _insert_data_into_db(all_songs_played: list): +def _insert_data_into_db(db: Database, all_songs_played: list): """ This function takes a list of all played songs and inserts these into the database. @@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list): log.error(f'Failed adding {entry} to database, error {e}') -def export_gdpr_data(n_limit: int = 100) -> None: +def export_gdpr_data(db: Database, n_limit: int = 100) -> None: all_songs_played = _read_gdrp_data() all_songs_played = all_songs_played[-n_limit:] all_songs_catalogued = _populate_ids(all_songs_played) all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued) - _insert_data_into_db(all_songs_played) - db.close(__name__) + _insert_data_into_db(db, all_songs_played) diff --git a/src/runtime.py b/src/runtime.py index b7cbc7b..fc802ec 100644 --- a/src/runtime.py +++ b/src/runtime.py @@ -1,9 +1,11 @@ import argparse import atexit +import os import sys import traceback from time import sleep +from database_handler import Database from gdpr_export import export_gdpr_data from logger import LoggerWrapper from scraper import scrape_missing_infos, scraping @@ -49,19 +51,25 @@ if args.verbose: log.set_console_handler_to_debug() log.info('Enabled verbose mode') +db_path = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db') + if args.export == 'TEST': export_size = 200 log.info(f'Scraping GDPR Data. Sample size: {export_size}') - export_gdpr_data(export_size) - scrape_missing_infos() + db = Database(db_path) + export_gdpr_data(db, export_size) + scrape_missing_infos(db) elif args.export == 'PRODUCTION': export_size = 1000000 log.info('Scraping all GDPR Data.') - export_gdpr_data(export_size) - scrape_missing_infos() + db = Database(db_path) + export_gdpr_data(db, export_size) + scrape_missing_infos(db) +else: + raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.') while True: log.info('Scraping API...') - scraping() + scraping(db) log.info('Done scraping API. Sleeping for 30 minutes...') sleep(1800) diff --git a/src/scraper.py b/src/scraper.py index bee9dba..621b14a 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -3,12 +3,10 @@ from database_handler import Database, Table from logger import LoggerWrapper from spotify_api import get_last_played_track, get_multiple_field_information -# Define DB -db = Database() log = LoggerWrapper() -def scraping() -> None: +def scraping(db: Database) -> None: """ This function is the main function that will be executed when the script is run """ @@ -16,15 +14,14 @@ def scraping() -> None: scope = "user-read-recently-played" bearer_token = authenticate(scope) - _read_recently_played_page_and_add_to_db(bearer_token=bearer_token) - scrape_missing_infos() + _read_recently_played_page_and_add_to_db(db, bearer_token) + scrape_missing_infos(db) -def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None: +def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None: """ This function gets a list of song play history and adds it into the database. 
""" - global db last_played_track = get_last_played_track(bearer_token=bearer_token) @@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None: f"\nReturned Value: {last_played_track}") -def scrape_missing_infos() -> None: +def scrape_missing_infos(db: Database) -> None: """ """ bearer_token_simple = simple_authenticate() - _process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') - _process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') - _process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') + _process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') + _process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') + _process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') + # _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features') -def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None: +def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None: if endpoint_name == 'albums': limit = 20 + elif endpoint_name == 'audio-features': + limit = 100 else: limit = 50 @@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_ ids_tuple = tuple(ids) ids.clear() response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) - _add_data_to_database(table_name, response) + _add_data_to_database(db, table_name, response) counter = 0 if len(ids) > 0: ids_tuple = tuple(ids) ids.clear() response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) - _add_data_to_database(table_name, response) + _add_data_to_database(db, table_name, response) -def 
_add_data_to_database(table_name: Table, response) -> None: - - global db +def _add_data_to_database(db: Database, table_name: Table, response) -> None: if table_name == Table.TRACK_INFORMATION: log.debug('Adding track information to database') @@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None: except IndexError: genre = "" db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity'])) + + elif table_name == Table.TRACK_ATTRIBUTES: + log.debug('Adding track attributes to database') + for entry in response['audio_features']: + log.debug(f"Adding track attributes: {entry['id']}") + try: + db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['acousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence'])) + except Exception as e: + log.error(f"Failed to add track attributes to database: {e}" + f"\nReturned Value: {response}")