Reworked DB initialisation to separate the test and prod databases

This commit is contained in:
agres
2025-03-24 12:20:37 +01:00
parent dbb80e207b
commit 4ae2a5ace6
4 changed files with 58 additions and 34 deletions
+16 -6
View File
@@ -1,10 +1,9 @@
import os
import sqlite3
from enum import Enum
from logger import LoggerWrapper
DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
log = LoggerWrapper()
@@ -22,7 +21,7 @@ class Database:
A class to handle the database connection and operations
"""
def __init__(self, db_name: str = DATABASE_PATH):
def __init__(self, db_name: str):
"""Initialize the connection to the database"""
self.db_name = db_name
self.conn = sqlite3.connect(db_name)
@@ -66,8 +65,18 @@ class Database:
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
track_id TEXT PRIMARY KEY,
attribute_name TEXT,
attribute_value TEXT
acousticness FLOAT,
danceability FLOAT,
duration_ms INTEGER,
energy FLOAT,
instrumentalness FLOAT,
key INTEGER,
liveness FLOAT,
loudness FLOAT,
speechiness FLOAT,
tempo FLOAT,
time_signature INTEGER,
valence FLOAT
);
''')
@@ -79,7 +88,8 @@ class Database:
album_id TEXT,
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
);
''')
+4 -6
View File
@@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information
# Define the absolute folder path to the folder containing the gdpr retrieved data
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
# Define the db
db = Database()
log = LoggerWrapper()
@@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued):
return all_songs_played
def _insert_data_into_db(all_songs_played: list):
def _insert_data_into_db(db: Database, all_songs_played: list):
"""
This function takes a list of all played songs and inserts these into the database.
@@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list):
log.error(f'Failed adding {entry} to database, error {e}')
def export_gdpr_data(n_limit: int = 100) -> None:
def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
all_songs_played = _read_gdrp_data()
all_songs_played = all_songs_played[-n_limit:]
all_songs_catalogued = _populate_ids(all_songs_played)
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
_insert_data_into_db(all_songs_played)
db.close(__name__)
_insert_data_into_db(db, all_songs_played)
+13 -5
View File
@@ -1,9 +1,11 @@
import argparse
import atexit
import os
import sys
import traceback
from time import sleep
from database_handler import Database
from gdpr_export import export_gdpr_data
from logger import LoggerWrapper
from scraper import scrape_missing_infos, scraping
@@ -49,19 +51,25 @@ if args.verbose:
log.set_console_handler_to_debug()
log.info('Enabled verbose mode')
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
if args.export == 'TEST':
export_size = 200
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
export_gdpr_data(export_size)
scrape_missing_infos()
db = Database(db_path)
export_gdpr_data(db, export_size)
scrape_missing_infos(db)
elif args.export == 'PRODUCTION':
export_size = 1000000
log.info('Scraping all GDPR Data.')
export_gdpr_data(export_size)
scrape_missing_infos()
db = Database(db_path)
export_gdpr_data(db, export_size)
scrape_missing_infos(db)
else:
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
while True:
log.info('Scraping API...')
scraping()
scraping(db)
log.info('Done scraping API. Sleeping for 30 minutes...')
sleep(1800)
+25 -17
View File
@@ -3,12 +3,10 @@ from database_handler import Database, Table
from logger import LoggerWrapper
from spotify_api import get_last_played_track, get_multiple_field_information
# Define DB
db = Database()
log = LoggerWrapper()
def scraping() -> None:
def scraping(db: Database) -> None:
"""
This function is the main function that will be executed when the script is run
"""
@@ -16,15 +14,14 @@ def scraping() -> None:
scope = "user-read-recently-played"
bearer_token = authenticate(scope)
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
scrape_missing_infos()
_read_recently_played_page_and_add_to_db(db, bearer_token)
scrape_missing_infos(db)
def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
"""
This function gets a list of song play history and adds it into the database.
"""
global db
last_played_track = get_last_played_track(bearer_token=bearer_token)
@@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
f"\nReturned Value: {last_played_track}")
def scrape_missing_infos() -> None:
def scrape_missing_infos(db: Database) -> None:
"""
"""
bearer_token_simple = simple_authenticate()
_process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
_process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
_process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
_process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
_process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
_process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
if endpoint_name == 'albums':
limit = 20
elif endpoint_name == 'audio-features':
limit = 100
else:
limit = 50
@@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_
ids_tuple = tuple(ids)
ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response)
_add_data_to_database(db, table_name, response)
counter = 0
if len(ids) > 0:
ids_tuple = tuple(ids)
ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response)
_add_data_to_database(db, table_name, response)
def _add_data_to_database(table_name: Table, response) -> None:
global db
def _add_data_to_database(db: Database, table_name: Table, response) -> None:
if table_name == Table.TRACK_INFORMATION:
log.debug('Adding track information to database')
@@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None:
except IndexError:
genre = ""
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
elif table_name == Table.TRACK_ATTRIBUTES:
log.debug('Adding track attributes to database')
for entry in response['audio_features']:
log.debug(f"Adding track attributes: {entry['id']}")
try:
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['aucousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
except Exception as e:
log.error(f"Failed to add track attributes to database: {e}"
f"\nReturned Value: {response}")