mirror of
https://github.com/agresdominik/predictify.git
synced 2026-04-21 17:55:49 +00:00
reworked db initialisation to separate a test and prod db
This commit is contained in:
+16
-6
@@ -1,10 +1,9 @@
|
|||||||
import os
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
from logger import LoggerWrapper
|
from logger import LoggerWrapper
|
||||||
|
|
||||||
DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
|
# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
|
||||||
|
|
||||||
log = LoggerWrapper()
|
log = LoggerWrapper()
|
||||||
|
|
||||||
@@ -22,7 +21,7 @@ class Database:
|
|||||||
A class to handle the database connection and operations
|
A class to handle the database connection and operations
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, db_name: str = DATABASE_PATH):
|
def __init__(self, db_name: str):
|
||||||
"""Initialize the connection to the database"""
|
"""Initialize the connection to the database"""
|
||||||
self.db_name = db_name
|
self.db_name = db_name
|
||||||
self.conn = sqlite3.connect(db_name)
|
self.conn = sqlite3.connect(db_name)
|
||||||
@@ -66,8 +65,18 @@ class Database:
|
|||||||
self.cursor.execute(f'''
|
self.cursor.execute(f'''
|
||||||
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
|
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
|
||||||
track_id TEXT PRIMARY KEY,
|
track_id TEXT PRIMARY KEY,
|
||||||
attribute_name TEXT,
|
acousticness FLOAT,
|
||||||
attribute_value TEXT
|
danceability FLOAT,
|
||||||
|
duration_ms INTEGER,
|
||||||
|
energy FLOAT,
|
||||||
|
instrumentalness FLOAT,
|
||||||
|
key INTEGER,
|
||||||
|
liveness FLOAT,
|
||||||
|
loudness FLOAT,
|
||||||
|
speechiness FLOAT,
|
||||||
|
tempo FLOAT,
|
||||||
|
time_signature INTEGER,
|
||||||
|
valence FLOAT
|
||||||
);
|
);
|
||||||
''')
|
''')
|
||||||
|
|
||||||
@@ -79,7 +88,8 @@ class Database:
|
|||||||
album_id TEXT,
|
album_id TEXT,
|
||||||
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
|
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
|
||||||
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
|
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
|
||||||
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
|
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
|
||||||
|
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
|
||||||
);
|
);
|
||||||
''')
|
''')
|
||||||
|
|
||||||
|
|||||||
+4
-6
@@ -8,8 +8,7 @@ from spotify_api import get_multiple_field_information
|
|||||||
|
|
||||||
# Define the absolute path to the folder containing the retrieved GDPR data
|
# Define the absolute path to the folder containing the retrieved GDPR data
|
||||||
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
|
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
|
||||||
# Define the db
|
|
||||||
db = Database()
|
|
||||||
log = LoggerWrapper()
|
log = LoggerWrapper()
|
||||||
|
|
||||||
|
|
||||||
@@ -129,7 +128,7 @@ def _fill_missing_ids(all_songs_played, all_songs_catalogued):
|
|||||||
return all_songs_played
|
return all_songs_played
|
||||||
|
|
||||||
|
|
||||||
def _insert_data_into_db(all_songs_played: list):
|
def _insert_data_into_db(db: Database, all_songs_played: list):
|
||||||
"""
|
"""
|
||||||
This function takes a list of all played songs and inserts these into the database.
|
This function takes a list of all played songs and inserts these into the database.
|
||||||
|
|
||||||
@@ -142,10 +141,9 @@ def _insert_data_into_db(all_songs_played: list):
|
|||||||
log.error(f'Failed adding {entry} to database, error {e}')
|
log.error(f'Failed adding {entry} to database, error {e}')
|
||||||
|
|
||||||
|
|
||||||
def export_gdpr_data(n_limit: int = 100) -> None:
|
def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
|
||||||
all_songs_played = _read_gdrp_data()
|
all_songs_played = _read_gdrp_data()
|
||||||
all_songs_played = all_songs_played[-n_limit:]
|
all_songs_played = all_songs_played[-n_limit:]
|
||||||
all_songs_catalogued = _populate_ids(all_songs_played)
|
all_songs_catalogued = _populate_ids(all_songs_played)
|
||||||
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
|
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
|
||||||
_insert_data_into_db(all_songs_played)
|
_insert_data_into_db(db, all_songs_played)
|
||||||
db.close(__name__)
|
|
||||||
|
|||||||
+13
-5
@@ -1,9 +1,11 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import atexit
|
import atexit
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
|
from database_handler import Database
|
||||||
from gdpr_export import export_gdpr_data
|
from gdpr_export import export_gdpr_data
|
||||||
from logger import LoggerWrapper
|
from logger import LoggerWrapper
|
||||||
from scraper import scrape_missing_infos, scraping
|
from scraper import scrape_missing_infos, scraping
|
||||||
@@ -49,19 +51,25 @@ if args.verbose:
|
|||||||
log.set_console_handler_to_debug()
|
log.set_console_handler_to_debug()
|
||||||
log.info('Enabled verbose mode')
|
log.info('Enabled verbose mode')
|
||||||
|
|
||||||
|
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
|
||||||
|
|
||||||
if args.export == 'TEST':
|
if args.export == 'TEST':
|
||||||
export_size = 200
|
export_size = 200
|
||||||
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
|
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
|
||||||
export_gdpr_data(export_size)
|
db = Database(db_path)
|
||||||
scrape_missing_infos()
|
export_gdpr_data(db, export_size)
|
||||||
|
scrape_missing_infos(db)
|
||||||
elif args.export == 'PRODUCTION':
|
elif args.export == 'PRODUCTION':
|
||||||
export_size = 1000000
|
export_size = 1000000
|
||||||
log.info('Scraping all GDPR Data.')
|
log.info('Scraping all GDPR Data.')
|
||||||
export_gdpr_data(export_size)
|
db = Database(db_path)
|
||||||
scrape_missing_infos()
|
export_gdpr_data(db, export_size)
|
||||||
|
scrape_missing_infos(db)
|
||||||
|
else:
|
||||||
|
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
log.info('Scraping API...')
|
log.info('Scraping API...')
|
||||||
scraping()
|
scraping(db)
|
||||||
log.info('Done scraping API. Sleeping for 30 minutes...')
|
log.info('Done scraping API. Sleeping for 30 minutes...')
|
||||||
sleep(1800)
|
sleep(1800)
|
||||||
|
|||||||
+25
-17
@@ -3,12 +3,10 @@ from database_handler import Database, Table
|
|||||||
from logger import LoggerWrapper
|
from logger import LoggerWrapper
|
||||||
from spotify_api import get_last_played_track, get_multiple_field_information
|
from spotify_api import get_last_played_track, get_multiple_field_information
|
||||||
|
|
||||||
# Define DB
|
|
||||||
db = Database()
|
|
||||||
log = LoggerWrapper()
|
log = LoggerWrapper()
|
||||||
|
|
||||||
|
|
||||||
def scraping() -> None:
|
def scraping(db: Database) -> None:
|
||||||
"""
|
"""
|
||||||
This function is the main function that will be executed when the script is run
|
This function is the main function that will be executed when the script is run
|
||||||
"""
|
"""
|
||||||
@@ -16,15 +14,14 @@ def scraping() -> None:
|
|||||||
scope = "user-read-recently-played"
|
scope = "user-read-recently-played"
|
||||||
bearer_token = authenticate(scope)
|
bearer_token = authenticate(scope)
|
||||||
|
|
||||||
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
|
_read_recently_played_page_and_add_to_db(db, bearer_token)
|
||||||
scrape_missing_infos()
|
scrape_missing_infos(db)
|
||||||
|
|
||||||
|
|
||||||
def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
|
def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
|
||||||
"""
|
"""
|
||||||
This function gets a list of song play history and adds it into the database.
|
This function gets a list of song play history and adds it into the database.
|
||||||
"""
|
"""
|
||||||
global db
|
|
||||||
|
|
||||||
last_played_track = get_last_played_track(bearer_token=bearer_token)
|
last_played_track = get_last_played_track(bearer_token=bearer_token)
|
||||||
|
|
||||||
@@ -40,21 +37,24 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str) -> None:
|
|||||||
f"\nReturned Value: {last_played_track}")
|
f"\nReturned Value: {last_played_track}")
|
||||||
|
|
||||||
|
|
||||||
def scrape_missing_infos() -> None:
|
def scrape_missing_infos(db: Database) -> None:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
bearer_token_simple = simple_authenticate()
|
bearer_token_simple = simple_authenticate()
|
||||||
|
|
||||||
_process_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
|
_process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
|
||||||
_process_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
|
_process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
|
||||||
_process_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
|
_process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
|
||||||
|
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
|
||||||
|
|
||||||
|
|
||||||
def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
|
def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
|
||||||
|
|
||||||
if endpoint_name == 'albums':
|
if endpoint_name == 'albums':
|
||||||
limit = 20
|
limit = 20
|
||||||
|
elif endpoint_name == 'audio-features':
|
||||||
|
limit = 100
|
||||||
else:
|
else:
|
||||||
limit = 50
|
limit = 50
|
||||||
|
|
||||||
@@ -82,19 +82,17 @@ def _process_missing_info(bearer_token_simple: str, table_name: Table, id_field_
|
|||||||
ids_tuple = tuple(ids)
|
ids_tuple = tuple(ids)
|
||||||
ids.clear()
|
ids.clear()
|
||||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||||
_add_data_to_database(table_name, response)
|
_add_data_to_database(db, table_name, response)
|
||||||
counter = 0
|
counter = 0
|
||||||
|
|
||||||
if len(ids) > 0:
|
if len(ids) > 0:
|
||||||
ids_tuple = tuple(ids)
|
ids_tuple = tuple(ids)
|
||||||
ids.clear()
|
ids.clear()
|
||||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||||
_add_data_to_database(table_name, response)
|
_add_data_to_database(db, table_name, response)
|
||||||
|
|
||||||
|
|
||||||
def _add_data_to_database(table_name: Table, response) -> None:
|
def _add_data_to_database(db: Database, table_name: Table, response) -> None:
|
||||||
|
|
||||||
global db
|
|
||||||
|
|
||||||
if table_name == Table.TRACK_INFORMATION:
|
if table_name == Table.TRACK_INFORMATION:
|
||||||
log.debug('Adding track information to database')
|
log.debug('Adding track information to database')
|
||||||
@@ -121,3 +119,13 @@ def _add_data_to_database(table_name: Table, response) -> None:
|
|||||||
except IndexError:
|
except IndexError:
|
||||||
genre = ""
|
genre = ""
|
||||||
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
|
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
|
||||||
|
|
||||||
|
elif table_name == Table.TRACK_ATTRIBUTES:
|
||||||
|
log.debug('Adding track attributes to database')
|
||||||
|
for entry in response['audio_features']:
|
||||||
|
log.debug(f"Adding track attributes: {entry['id']}")
|
||||||
|
try:
|
||||||
|
# The key is 'acousticness' (matching the table column of the same name); the former 'aucousticness' spelling does not match it.
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['acousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
|
||||||
|
except Exception as e:
|
||||||
|
log.error(f"Failed to add track attributes to database: {e}"
|
||||||
|
f"\nReturned Value: {response}")
|
||||||
|
|||||||
Reference in New Issue
Block a user