mirror of
https://github.com/agresdominik/predictify.git
synced 2026-04-21 17:55:49 +00:00
reworked db initialisation to separate a test and prod db
This commit is contained in:
+16
-6
@@ -1,10 +1,9 @@
|
||||
import os
|
||||
import sqlite3
|
||||
from enum import Enum
|
||||
|
||||
from logger import LoggerWrapper
|
||||
|
||||
DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
|
||||
# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
@@ -22,7 +21,7 @@ class Database:
|
||||
A class to handle the database connection and operations
|
||||
"""
|
||||
|
||||
def __init__(self, db_name: str = DATABASE_PATH):
|
||||
def __init__(self, db_name: str):
|
||||
"""Initialize the connection to the database"""
|
||||
self.db_name = db_name
|
||||
self.conn = sqlite3.connect(db_name)
|
||||
@@ -66,8 +65,18 @@ class Database:
|
||||
self.cursor.execute(f'''
|
||||
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
|
||||
track_id TEXT PRIMARY KEY,
|
||||
attribute_name TEXT,
|
||||
attribute_value TEXT
|
||||
acousticness FLOAT,
|
||||
danceability FLOAT,
|
||||
duration_ms INTEGER,
|
||||
energy FLOAT,
|
||||
instrumentalness FLOAT,
|
||||
key INTEGER,
|
||||
liveness FLOAT,
|
||||
loudness FLOAT,
|
||||
speechiness FLOAT,
|
||||
tempo FLOAT,
|
||||
time_signature INTEGER,
|
||||
valence FLOAT
|
||||
);
|
||||
''')
|
||||
|
||||
@@ -79,7 +88,8 @@ class Database:
|
||||
album_id TEXT,
|
||||
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
|
||||
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
|
||||
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
|
||||
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
|
||||
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
|
||||
);
|
||||
''')
|
||||
|
||||
|
||||
+4
-6
@@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information
|
||||
|
||||
# Define the absolute folder path to the folder containing the gdpr retrieved data
|
||||
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
|
||||
# Define the db
|
||||
db = Database()
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
@@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued):
|
||||
return all_songs_played
|
||||
|
||||
|
||||
def _insert_data_into_db(all_songs_played: list):
|
||||
def _insert_data_into_db(db: Database, all_songs_played: list):
|
||||
"""
|
||||
This function takes a list of all played songs and inserts these into the database.
|
||||
|
||||
@@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list):
|
||||
log.error(f'Failed adding {entry} to database, error {e}')
|
||||
|
||||
|
||||
def export_gdpr_data(n_limit: int = 100) -> None:
|
||||
def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
|
||||
all_songs_played = _read_gdrp_data()
|
||||
all_songs_played = all_songs_played[-n_limit:]
|
||||
all_songs_catalogued = _populate_ids(all_songs_played)
|
||||
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
|
||||
_insert_data_into_db(all_songs_played)
|
||||
db.close(__name__)
|
||||
_insert_data_into_db(db, all_songs_played)
|
||||
|
||||
+13
-5
@@ -1,9 +1,11 @@
|
||||
import argparse
|
||||
import atexit
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from time import sleep
|
||||
|
||||
from database_handler import Database
|
||||
from gdpr_export import export_gdpr_data
|
||||
from logger import LoggerWrapper
|
||||
from scraper import scrape_missing_infos, scraping
|
||||
@@ -49,19 +51,25 @@ if args.verbose:
|
||||
log.set_console_handler_to_debug()
|
||||
log.info('Enabled verbose mode')
|
||||
|
||||
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
|
||||
|
||||
if args.export == 'TEST':
|
||||
export_size = 200
|
||||
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
|
||||
export_gdpr_data(export_size)
|
||||
scrape_missing_infos()
|
||||
db = Database(db_path)
|
||||
export_gdpr_data(db, export_size)
|
||||
scrape_missing_infos(db)
|
||||
elif args.export == 'PRODUCTION':
|
||||
export_size = 1000000
|
||||
log.info('Scraping all GDPR Data.')
|
||||
export_gdpr_data(export_size)
|
||||
scrape_missing_infos()
|
||||
db = Database(db_path)
|
||||
export_gdpr_data(db, export_size)
|
||||
scrape_missing_infos(db)
|
||||
else:
|
||||
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
|
||||
|
||||
while True:
|
||||
log.info('Scraping API...')
|
||||
scraping()
|
||||
scraping(db)
|
||||
log.info('Done scraping API. Sleeping for 30 minutes...')
|
||||
sleep(1800)
|
||||
|
||||
+25
-17
@@ -3,12 +3,10 @@ from database_handler import Database, Table
|
||||
from logger import LoggerWrapper
|
||||
from spotify_api import get_last_played_track, get_multiple_field_information
|
||||
|
||||
# Define DB
|
||||
db = Database()
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def scraping() -> None:
|
||||
def scraping(db: Database) -> None:
|
||||
"""
|
||||
This function is the main function that will be executed when the script is run
|
||||
"""
|
||||
@@ -16,15 +14,14 @@ def scraping() -> None:
|
||||
scope = "user-read-recently-played"
|
||||
bearer_token = authenticate(scope)
|
||||
|
||||
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
|
||||
scrape_missing_infos()
|
||||
_read_recently_played_page_and_add_to_db(db, bearer_token)
|
||||
scrape_missing_infos(db)
|
||||
|
||||
|
||||
def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
|
||||
def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
|
||||
"""
|
||||
This function gets a list of song play history and adds it into the database.
|
||||
"""
|
||||
global db
|
||||
|
||||
last_played_track = get_last_played_track(bearer_token=bearer_token)
|
||||
|
||||
@@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
|
||||
f"\nReturned Value: {last_played_track}")
|
||||
|
||||
|
||||
def scrape_missing_infos() -> None:
|
||||
def scrape_missing_infos(db: Database) -> None:
|
||||
"""
|
||||
|
||||
"""
|
||||
bearer_token_simple = simple_authenticate()
|
||||
|
||||
_process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
|
||||
_process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
|
||||
_process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
|
||||
_process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
|
||||
_process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
|
||||
_process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
|
||||
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
|
||||
|
||||
|
||||
def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
|
||||
def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
|
||||
|
||||
if endpoint_name == 'albums':
|
||||
limit = 20
|
||||
elif endpoint_name == 'audio-features':
|
||||
limit = 100
|
||||
else:
|
||||
limit = 50
|
||||
|
||||
@@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_
|
||||
ids_tuple = tuple(ids)
|
||||
ids.clear()
|
||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||
_add_data_to_database(table_name, response)
|
||||
_add_data_to_database(db, table_name, response)
|
||||
counter = 0
|
||||
|
||||
if len(ids) > 0:
|
||||
ids_tuple = tuple(ids)
|
||||
ids.clear()
|
||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||
_add_data_to_database(table_name, response)
|
||||
_add_data_to_database(db, table_name, response)
|
||||
|
||||
|
||||
def _add_data_to_database(table_name: Table, response) -> None:
|
||||
|
||||
global db
|
||||
def _add_data_to_database(db: Database, table_name: Table, response) -> None:
|
||||
|
||||
if table_name == Table.TRACK_INFORMATION:
|
||||
log.debug('Adding track information to database')
|
||||
@@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None:
|
||||
except IndexError:
|
||||
genre = ""
|
||||
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
|
||||
|
||||
elif table_name == Table.TRACK_ATTRIBUTES:
|
||||
log.debug('Adding track attributes to database')
|
||||
for entry in response['audio_features']:
|
||||
log.debug(f"Adding track attributes: {entry['id']}")
|
||||
try:
|
||||
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['aucousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
|
||||
except Exception as e:
|
||||
log.error(f"Failed to add track attributes to database: {e}"
|
||||
f"\nReturned Value: {response}")
|
||||
|
||||
Reference in New Issue
Block a user