Reworked DB initialisation to separate the test and prod databases

This commit is contained in:
agres
2025-03-24 12:20:37 +01:00
parent dbb80e207b
commit 4ae2a5ace6
4 changed files with 58 additions and 34 deletions
+16 -6
View File
@@ -1,10 +1,9 @@
import os
import sqlite3
from enum import Enum
from logger import LoggerWrapper
DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
log = LoggerWrapper()
@@ -22,7 +21,7 @@ class Database:
A class to handle the database connection and operations
"""
def __init__(self, db_name: str = DATABASE_PATH):
def __init__(self, db_name: str):
"""Initialize the connection to the database"""
self.db_name = db_name
self.conn = sqlite3.connect(db_name)
@@ -66,8 +65,18 @@ class Database:
self.cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
track_id TEXT PRIMARY KEY,
attribute_name TEXT,
attribute_value TEXT
acousticness FLOAT,
danceability FLOAT,
duration_ms INTEGER,
energy FLOAT,
instrumentalness FLOAT,
key INTEGER,
liveness FLOAT,
loudness FLOAT,
speechiness FLOAT,
tempo FLOAT,
time_signature INTEGER,
valence FLOAT
);
''')
@@ -79,7 +88,8 @@ class Database:
album_id TEXT,
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
);
''')
+4 -6
View File
@@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information
# Define the absolute folder path to the folder containing the gdpr retrieved data
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
# Define the db
db = Database()
log = LoggerWrapper()
@@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued):
return all_songs_played
def _insert_data_into_db(all_songs_played: list):
def _insert_data_into_db(db: Database, all_songs_played: list):
"""
This function takes a list of all played songs and inserts these into the database.
@@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list):
log.error(f'Failed adding {entry} to database, error {e}')
def export_gdpr_data(n_limit: int = 100) -> None:
def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
all_songs_played = _read_gdrp_data()
all_songs_played = all_songs_played[-n_limit:]
all_songs_catalogued = _populate_ids(all_songs_played)
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
_insert_data_into_db(all_songs_played)
db.close(__name__)
_insert_data_into_db(db, all_songs_played)
+13 -5
View File
@@ -1,9 +1,11 @@
import argparse
import atexit
import os
import sys
import traceback
from time import sleep
from database_handler import Database
from gdpr_export import export_gdpr_data
from logger import LoggerWrapper
from scraper import scrape_missing_infos, scraping
@@ -49,19 +51,25 @@ if args.verbose:
log.set_console_handler_to_debug()
log.info('Enabled verbose mode')
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
if args.export == 'TEST':
export_size = 200
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
export_gdpr_data(export_size)
scrape_missing_infos()
db = Database(db_path)
export_gdpr_data(db, export_size)
scrape_missing_infos(db)
elif args.export == 'PRODUCTION':
export_size = 1000000
log.info('Scraping all GDPR Data.')
export_gdpr_data(export_size)
scrape_missing_infos()
db = Database(db_path)
export_gdpr_data(db, export_size)
scrape_missing_infos(db)
else:
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
while True:
log.info('Scraping API...')
scraping()
scraping(db)
log.info('Done scraping API. Sleeping for 30 minutes...')
sleep(1800)
+25 -17
View File
@@ -3,12 +3,10 @@ from database_handler import Database, Table
from logger import LoggerWrapper
from spotify_api import get_last_played_track, get_multiple_field_information
# Define DB
db = Database()
log = LoggerWrapper()
def scraping() -> None:
def scraping(db: Database) -> None:
"""
This function is the main function that will be executed when the script is run
"""
@@ -16,15 +14,14 @@ def scraping() -> None:
scope = "user-read-recently-played"
bearer_token = authenticate(scope)
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
scrape_missing_infos()
_read_recently_played_page_and_add_to_db(db, bearer_token)
scrape_missing_infos(db)
def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
"""
This function gets a list of song play history and adds it into the database.
"""
global db
last_played_track = get_last_played_track(bearer_token=bearer_token)
@@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
f"\nReturned Value: {last_played_track}")
def scrape_missing_infos() -> None:
def scrape_missing_infos(db: Database) -> None:
"""
"""
bearer_token_simple = simple_authenticate()
_process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
_process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
_process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
_process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
_process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
_process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
if endpoint_name == 'albums':
limit = 20
elif endpoint_name == 'audio-features':
limit = 100
else:
limit = 50
@@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_
ids_tuple = tuple(ids)
ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response)
_add_data_to_database(db, table_name, response)
counter = 0
if len(ids) > 0:
ids_tuple = tuple(ids)
ids.clear()
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
_add_data_to_database(table_name, response)
_add_data_to_database(db, table_name, response)
def _add_data_to_database(table_name: Table, response) -> None:
global db
def _add_data_to_database(db: Database, table_name: Table, response) -> None:
if table_name == Table.TRACK_INFORMATION:
log.debug('Adding track information to database')
@@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None:
except IndexError:
genre = ""
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
elif table_name == Table.TRACK_ATTRIBUTES:
log.debug('Adding track attributes to database')
for entry in response['audio_features']:
log.debug(f"Adding track attributes: {entry['id']}")
try:
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['aucousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
except Exception as e:
log.error(f"Failed to add track attributes to database: {e}"
f"\nReturned Value: {response}")