reworked db initialisation to separate a test and prod db

This commit is contained in:
agres
2025-03-24 12:20:37 +01:00
parent dbb80e207b
commit 4ae2a5ace6
4 changed files with 58 additions and 34 deletions
+16 -6
View File
@@ -1,10 +1,9 @@
import os
import sqlite3 import sqlite3
from enum import Enum from enum import Enum
from logger import LoggerWrapper from logger import LoggerWrapper
DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db') # DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
log = LoggerWrapper() log = LoggerWrapper()
@@ -22,7 +21,7 @@ class Database:
A class to handle the database connection and operations A class to handle the database connection and operations
""" """
def __init__(self, db_name: str = DATABASE_PATH): def __init__(self, db_name: str):
"""Initialize the connection to the database""" """Initialize the connection to the database"""
self.db_name = db_name self.db_name = db_name
self.conn = sqlite3.connect(db_name) self.conn = sqlite3.connect(db_name)
@@ -66,8 +65,18 @@ class Database:
self.cursor.execute(f''' self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} ( CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
track_id TEXT PRIMARY KEY, track_id TEXT PRIMARY KEY,
attribute_name TEXT, acousticness FLOAT,
attribute_value TEXT danceability FLOAT,
duration_ms INTEGER,
energy FLOAT,
instrumentalness FLOAT,
key INTEGER,
liveness FLOAT,
loudness FLOAT,
speechiness FLOAT,
tempo FLOAT,
time_signature INTEGER,
valence FLOAT
); );
''') ''')
@@ -79,7 +88,8 @@ class Database:
album_id TEXT, album_id TEXT,
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id), FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id), FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id) FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
); );
''') ''')
+4 -6
View File
@@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information
# Define the absolute path to the folder containing the retrieved GDPR data # Define the absolute path to the folder containing the retrieved GDPR data
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data') folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
# Define the db
db = Database()
log = LoggerWrapper() log = LoggerWrapper()
@@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued):
return all_songs_played return all_songs_played
def _insert_data_into_db(all_songs_played: list): def _insert_data_into_db(db: Database, all_songs_played: list):
""" """
This function takes a list of all played songs and inserts these into the database. This function takes a list of all played songs and inserts these into the database.
@@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list):
log.error(f'Failed adding {entry} to database, error {e}') log.error(f'Failed adding {entry} to database, error {e}')
def export_gdpr_data(n_limit: int = 100) -> None: def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
all_songs_played = _read_gdrp_data() all_songs_played = _read_gdrp_data()
all_songs_played = all_songs_played[-n_limit:] all_songs_played = all_songs_played[-n_limit:]
all_songs_catalogued = _populate_ids(all_songs_played) all_songs_catalogued = _populate_ids(all_songs_played)
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued) all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
_insert_data_into_db(all_songs_played) _insert_data_into_db(db, all_songs_played)
db.close(__name__)
+13 -5
View File
@@ -1,9 +1,11 @@
import argparse import argparse
import atexit import atexit
import os
import sys import sys
import traceback import traceback
from time import sleep from time import sleep
from database_handler import Database
from gdpr_export import export_gdpr_data from gdpr_export import export_gdpr_data
from logger import LoggerWrapper from logger import LoggerWrapper
from scraper import scrape_missing_infos, scraping from scraper import scrape_missing_infos, scraping
@@ -49,19 +51,25 @@ if args.verbose:
log.set_console_handler_to_debug() log.set_console_handler_to_debug()
log.info('Enabled verbose mode') log.info('Enabled verbose mode')
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
if args.export == 'TEST': if args.export == 'TEST':
export_size = 200 export_size = 200
log.info(f'Scraping GDPR Data. Sample size: {export_size}') log.info(f'Scraping GDPR Data. Sample size: {export_size}')
export_gdpr_data(export_size) db = Database(db_path)
scrape_missing_infos() export_gdpr_data(db, export_size)
scrape_missing_infos(db)
elif args.export == 'PRODUCTION': elif args.export == 'PRODUCTION':
export_size = 1000000 export_size = 1000000
log.info('Scraping all GDPR Data.') log.info('Scraping all GDPR Data.')
export_gdpr_data(export_size) db = Database(db_path)
scrape_missing_infos() export_gdpr_data(db, export_size)
scrape_missing_infos(db)
else:
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
while True: while True:
log.info('Scraping API...') log.info('Scraping API...')
scraping() scraping(db)
log.info('Done scraping API. Sleeping for 30 minutes...') log.info('Done scraping API. Sleeping for 30 minutes...')
sleep(1800) sleep(1800)
+25 -17
View File
@@ -3,12 +3,10 @@ from database_handler import Database, Table
from logger import LoggerWrapper from logger import LoggerWrapper
from spotify_api import get_last_played_track, get_multiple_field_information from spotify_api import get_last_played_track, get_multiple_field_information
# Define DB
db = Database()
log = LoggerWrapper() log = LoggerWrapper()
def scraping() -> None: def scraping(db: Database) -> None:
""" """
This function is the main function that will be executed when the script is run This function is the main function that will be executed when the script is run
""" """
@@ -16,15 +14,14 @@ def scraping() -> None:
scope = "user-read-recently-played" scope = "user-read-recently-played"
bearer_token = authenticate(scope) bearer_token = authenticate(scope)
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token) _read_recently_played_page_and_add_to_db(db, bearer_token)
scrape_missing_infos() scrape_missing_infos(db)
def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None: def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
""" """
This function gets a list of song play history and adds it into the database. This function gets a list of song play history and adds it into the database.
""" """
global db
last_played_track = get_last_played_track(bearer_token=bearer_token) last_played_track = get_last_played_track(bearer_token=bearer_token)
@@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
f"\nReturned Value: {last_played_track}") f"\nReturned Value: {last_played_track}")
def scrape_missing_infos() -> None: def scrape_missing_infos(db: Database) -> None:
""" """
""" """
bearer_token_simple = simple_authenticate() bearer_token_simple = simple_authenticate()
_process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') _process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
_process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') _process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
_process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') _process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None: def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
if endpoint_name == 'albums': if endpoint_name == 'albums':
limit = 20 limit = 20
elif endpoint_name == 'audio-features':
limit = 100
else: else:
limit = 50 limit = 50
@@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_
ids_tuple = tuple(ids) ids_tuple = tuple(ids)
ids.clear() ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response) _add_data_to_database(db, table_name, response)
counter = 0 counter = 0
if len(ids) > 0: if len(ids) > 0:
ids_tuple = tuple(ids) ids_tuple = tuple(ids)
ids.clear() ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response) _add_data_to_database(db, table_name, response)
def _add_data_to_database(table_name: Table, response) -> None: def _add_data_to_database(db: Database, table_name: Table, response) -> None:
global db
if table_name == Table.TRACK_INFORMATION: if table_name == Table.TRACK_INFORMATION:
log.debug('Adding track information to database') log.debug('Adding track information to database')
@@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None:
except IndexError: except IndexError:
genre = "" genre = ""
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity'])) db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
elif table_name == Table.TRACK_ATTRIBUTES:
log.debug('Adding track attributes to database')
for entry in response['audio_features']:
log.debug(f"Adding track attributes: {entry['id']}")
try:
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['aucousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
except Exception as e:
log.error(f"Failed to add track attributes to database: {e}"
f"\nReturned Value: {response}")