From af6b3dba13c7595b512769715ba02e501748d25a Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 20:49:00 +0100 Subject: [PATCH 1/6] The most convoluted, code-efficient, bloated, boilerplated, duplicated unneccecary undocumented, soon to be regretted code --- src/scraper.py | 104 ++++++++++++++++++++++++++++----------------- src/spotify_api.py | 29 +++++++++++++ 2 files changed, 93 insertions(+), 40 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index 3d59a5b..e488e35 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -1,11 +1,6 @@ from auth import authenticate, simple_authenticate from database_handler import Database, Table -from spotify_api import ( - get_album_information, - get_artist_information, - get_last_played_track, - get_track_information, -) +from spotify_api import get_last_played_track, get_multiple_field_information # Define DB db = Database() @@ -20,12 +15,9 @@ def scraping(): scope = "user-read-recently-played" bearer_token = authenticate(scope) - # Once each 30 mins _read_recently_played_page_and_add_to_db(bearer_token=bearer_token) scrape_missing_infos() - db.close() - def _read_recently_played_page_and_add_to_db(bearer_token: str): """ @@ -47,36 +39,68 @@ def scrape_missing_infos(): """ """ - global db - bearer_token_simple = simple_authenticate() - # Track Info - all_track_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'track_id') - all_track_ids_saved = db.read_all_rows(Table.TRACK_INFORMATION, 'track_id') - all_track_ids_missing = list(set(all_track_ids_recently_played) - set(all_track_ids_saved)) - for track_id in all_track_ids_missing: - response = get_track_information(track_id=track_id[0], bearer_token=bearer_token_simple) - db.add_row(Table.TRACK_INFORMATION, (response['id'], response['name'], response['duration_ms'], response['explicit'], response['popularity'])) - # Album Info - all_album_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'album_id') - all_album_ids_saved = 
db.read_all_rows(Table.ALBUM_INFORMATION, 'album_id') - all_album_ids_missing = list(set(all_album_ids_recently_played) - set(all_album_ids_saved)) - for album_id in all_album_ids_missing: - response = get_album_information(album_id=album_id[0], bearer_token=bearer_token_simple) - try: - release_year = response['release_date'][:4] - except Exception: - release_year = "" - db.add_row(Table.ALBUM_INFORMATION, (response['id'], response['name'], response['album_type'], response['total_tracks'], release_year, response['label'])) - # Artist Info - all_artist_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'artist_id') - all_artist_ids_saved = db.read_all_rows(Table.ARTIST_INFORMATION, 'artist_id') - all_artist_ids_missing = list(set(all_artist_ids_recently_played) - set(all_artist_ids_saved)) - for artist_id in all_artist_ids_missing: - response = get_artist_information(artist_id=artist_id[0], bearer_token=bearer_token_simple) - try: - genre = response['genres'][0] - except IndexError: - genre = "" - db.add_row(Table.ARTIST_INFORMATION, (response['id'], response['name'], response['followers']['total'], genre, response['popularity'])) + _scrape_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks') + _scrape_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums') + _scrape_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists') + + +def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str): + + if endpoint_name == 'albums': + limit = 20 + else: + limit = 50 + + all_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, id_field_name) + all_ids_saved = db.read_all_rows(table_name, id_field_name) + all_ids_missing = list(set(all_ids_recently_played) - set(all_ids_saved)) + + ids = [] + processed_ids = set() + + for i, id_value in enumerate(all_ids_missing): + + id_value_str = id_value[0] + + if id_value_str not in 
processed_ids: + ids.append(id_value_str) + processed_ids.add(id_value_str) + + if (i + 1) % limit == 0: + ids_tuple = tuple(ids) + ids.clear() + response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) + _add_data_to_database(table_name, response) + + if ids: + ids_tuple = tuple(ids) + ids.clear() + response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) + _add_data_to_database(table_name, response) + + +def _add_data_to_database(table_name: Table, response): + + global db + + if table_name == Table.TRACK_INFORMATION: + for entry in response['tracks']: + db.add_row(table_name, (entry['id'], entry['name'], entry['duration_ms'], entry['explicit'], entry['popularity'])) + + elif table_name == Table.ALBUM_INFORMATION: + for entry in response['albums']: + try: + release_year = entry['release_date'][:4] + except Exception: + release_year = "" + db.add_row(table_name, (entry['id'], entry['name'], entry['album_type'], entry['total_tracks'], release_year, entry['label'])) + + elif table_name == Table.ARTIST_INFORMATION: + for entry in response['artists']: + try: + genre = entry['genres'][0] + except IndexError: + genre = "" + db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity'])) diff --git a/src/spotify_api.py b/src/spotify_api.py index 98a9664..211fbf7 100644 --- a/src/spotify_api.py +++ b/src/spotify_api.py @@ -104,3 +104,32 @@ def get_album_information(album_id: str, bearer_token: str) -> dict: response = requests.get(url, headers=header) response_json = response.json() return response_json + + +def get_multiple_field_information(bearer_token: str, api_type: str, limit: int, *track_ids) -> dict: + """ + This function returns the track information based on the track id + + :param *track_id: str + :param bearer_token: str + :return: dict + """ + + if len(track_ids) > limit: + log.error('Passed more than 20/50 ids to 
get_multiple_field_information') + return None + + url_suffix = "ids=" + separator = "," + for track_id in track_ids: + url_suffix = url_suffix + track_id + separator + + url = f"https://api.spotify.com/v1/{api_type}?{url_suffix}" + url = url[:-len(separator)] + header = { + 'Authorization': f'Bearer {bearer_token}' + } + + response = requests.get(url, headers=header) + response_json = response.json() + return response_json From 164ea9aca9c9d412e669154cc0803fead9c0b4e1 Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 20:49:34 +0100 Subject: [PATCH 2/6] 'fix' for a bug --- src/runtime.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/runtime.py b/src/runtime.py index c69f97e..46855b8 100644 --- a/src/runtime.py +++ b/src/runtime.py @@ -1,9 +1,12 @@ import argparse from time import sleep +from database_handler import Database from gdpr_export import export_gdpr_data from scraper import scrape_missing_infos, scraping +db = Database() + # Initialize the parser parser = argparse.ArgumentParser(description="A python script written in Python3.13 which continuously checks what spotify songs " "the user is listening to and logging these in a local database. 
\n" @@ -32,3 +35,7 @@ while True: scraping() print('Done Scraping') sleep(1800) + + +# TODO: Trap this: +db.close() From e73752c0152b18ae9081bf591b78b7e596454e9f Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 21:54:44 +0100 Subject: [PATCH 3/6] Fixed counter issue where the accumulation would be calculated wrongly and the requests were not at the fullest capacity --- src/scraper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/scraper.py b/src/scraper.py index e488e35..489b72f 100644 --- a/src/scraper.py +++ b/src/scraper.py @@ -60,21 +60,25 @@ def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_n ids = [] processed_ids = set() - for i, id_value in enumerate(all_ids_missing): + counter = 0 + + for id_value in all_ids_missing: id_value_str = id_value[0] if id_value_str not in processed_ids: ids.append(id_value_str) processed_ids.add(id_value_str) + counter += 1 - if (i + 1) % limit == 0: + if (counter + 1) % limit == 0 and len(ids) > 0: ids_tuple = tuple(ids) ids.clear() response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) _add_data_to_database(table_name, response) + counter = 0 - if ids: + if len(ids) > 0: ids_tuple = tuple(ids) ids.clear() response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple) From 7552d437022a6b25003df24c36fab6ba6d06e6e8 Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 21:55:11 +0100 Subject: [PATCH 4/6] Bigger batch size --- src/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime.py b/src/runtime.py index 46855b8..ee73e83 100644 --- a/src/runtime.py +++ b/src/runtime.py @@ -27,7 +27,7 @@ if args.export: print('Scraping GDPR Data') # The next function can gat a int witch defines the amount of songs witch will be scraped from the gdpr files. # e.g. if 500 is input, the last 500 played songs will come up, if left empty, the last 100. 
- export_gdpr_data() + export_gdpr_data(1000000) scrape_missing_infos() while True: From 9d48f7bef7088cc2778e7a96657b6e28ac59c7bc Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 2025 21:55:38 +0100 Subject: [PATCH 5/6] Fixed counter bug --- src/gdpr_export.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/gdpr_export.py b/src/gdpr_export.py index 65c51f7..319a20c 100644 --- a/src/gdpr_export.py +++ b/src/gdpr_export.py @@ -4,7 +4,7 @@ import os from auth import simple_authenticate from database_handler import Database, Table -from spotify_api import get_multiple_tracks_information +from spotify_api import get_multiple_field_information # Define the absolute folder path to the folder containing the gdrp retrieved data folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data') @@ -71,22 +71,26 @@ def _populate_ids(all_songs_played: list): processed_songs_id = set() - for i, entry in enumerate(all_songs_played): + counter = 0 + + for entry in all_songs_played: track_id = entry['id'] if track_id not in processed_songs_id: track_ids.append(track_id) processed_songs_id.add(track_id) + counter += 1 - if (i + 1) % 50 == 0: + if (counter + 1) % 50 == 0 and len(track_ids) > 0: track_ids_tuple = tuple(track_ids) track_ids.clear() - response = get_multiple_tracks_information(token, *track_ids_tuple) + response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple) all_songs_played_info.extend(_sort_and_create_required_dataset(response)) + counter = 0 - if track_ids: + if len(track_ids) > 0: track_ids_tuple = tuple(track_ids) - response = get_multiple_tracks_information(token, *track_ids_tuple) + response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple) all_songs_played_info.extend(_sort_and_create_required_dataset(response)) return all_songs_played_info From c65a1a8c8b5d877e00c5a14eb249441593b5e79c Mon Sep 17 00:00:00 2001 From: agres Date: Sun, 23 Mar 
2025 21:56:09 +0100 Subject: [PATCH 6/6] Unified spotify api request into one function --- src/spotify_api.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/spotify_api.py b/src/spotify_api.py index 211fbf7..b8a2f91 100644 --- a/src/spotify_api.py +++ b/src/spotify_api.py @@ -40,34 +40,6 @@ def get_track_information(track_id: str, bearer_token: str) -> dict: return response_json -def get_multiple_tracks_information(bearer_token: str, *track_ids) -> dict: - """ - This function returns the track information based on the track id - - :param *track_id: str - :param bearer_token: str - :return: dict - """ - if len(track_ids) > 50: - log.error('Passed more than 50 track ids to get_multiple_tracks_information') - return None - - url_suffix = "ids=" - separator = "," - for track_id in track_ids: - url_suffix = url_suffix + track_id + separator - - url = f"https://api.spotify.com/v1/tracks?{url_suffix}" - url = url[:-len(separator)] - header = { - 'Authorization': f'Bearer {bearer_token}' - } - - response = requests.get(url, headers=header) - response_json = response.json() - return response_json - - def get_artist_information(artist_id: str, bearer_token: str) -> dict: """ This function returns the artist information based on the artist id