Feat/import gdrp data (#26)

* Some simple code for extracting data from the JSONs * Jupyter Notebook * Mac-specific gitignore * Fixed finding paths to folders * Delete src/gdpr_data directory * Updated gitignore to include my testing file * Added the standard saving path for the database in the database handler; this way multiple files don't have to be updated when moving the database position * Moved the API usage wrappers into their own file, added a function for getting multiple track_ids at once; this still needs to be tested more * Further code for extracting data from the gdpr files * Forgot * Final & tested version of get_multiple_tracks_information endpoint * Further functionality: The code now extracts the id of each listened song and makes an API call to get info about these songs via the multiple tracks API. Furthermore, we track the songs for which the call has already been made and skip these * Added function to map catalogued ids into the play history * Added args parser to runtime program, cleaned up some code * Fixed a bug where the database would always try to create tables, even if they exist * Added some small text for clean interface * Some final fixes to actual code, fixed db bug, reversed the order of database entries * Some documentation * Added -export args to docker runtime * fix
2026-04-21 17:55:49 +00:00 · 2025-03-23 18:48:57 +01:00
parent 535225392f
commit ff9d726b47
11 changed files with 412 additions and 90 deletions
@@ -0,0 +1,143 @@
+import json
+import logging as log
+import os
+
+from auth import simple_authenticate
+from database_handler import Database, Table
+from spotify_api import get_multiple_tracks_information
+
+# Absolute path to the folder holding the GDPR export files: one directory above
+# this module's own directory, under data/gdpr_data
+folder_path = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)),
+    os.pardir,
+    'data',
+    'gdpr_data',
+)
+# Module-wide database handle
+db = Database()
+
+
+def _read_gdrp_data() -> list:
+    """
+    Read every .json file in the GDPR data folder and collect the played songs.
+
+    Entries whose 'spotify_track_uri' is None (podcasts) are skipped, as are
+    malformed entries that lack one of the required fields. The result is
+    sorted by timestamp ascending.
+
+    :return: all_songs_played: A list of dicts, one per played song, with the keys
+             timestamp, id, track_name, artist_name, album_name, conn_country
+             and ms_played
+    """
+    all_songs_played = []
+
+    # Sorted so that entries with equal timestamps end up in a reproducible order
+    for filename in sorted(os.listdir(folder_path)):
+
+        if not filename.endswith('.json'):
+            continue
+
+        file_path = os.path.join(folder_path, filename)
+
+        # Read as UTF-8 explicitly instead of relying on the platform default encoding
+        with open(file_path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+
+        for entry in data:
+            try:
+                track_uri = entry['spotify_track_uri']
+                # This removes all podcasts from the list
+                if track_uri is None:
+                    continue
+                track = {
+                    'timestamp': entry['ts'],
+                    'id': _extract_id(track_uri),
+                    'track_name': entry['master_metadata_track_name'],
+                    'artist_name': entry['master_metadata_album_artist_name'],
+                    'album_name': entry['master_metadata_album_album_name'],
+                    'conn_country': entry['conn_country'],
+                    'ms_played': entry['ms_played'],
+                }
+            except (KeyError, TypeError) as e:
+                # A single malformed entry must not abort the whole import
+                log.warning(f'Missing field: {e}')
+                continue
+            all_songs_played.append(track)
+
+    all_songs_played.sort(key=lambda song: song['timestamp'])
+    return all_songs_played
+
+
+def _extract_id(spotify_id: str) -> str:
+    """
+    Strip the 'spotify:track:' prefix from a track URI and return the bare id.
+
+    Values that do not start with the prefix are returned unchanged instead of
+    having their first characters cut off.
+
+    :param: spotify_id a track URI such as 'spotify:track:<id>', or an already bare id
+    :return: str the ID
+    """
+    prefix = "spotify:track:"
+    if spotify_id.startswith(prefix):
+        return spotify_id[len(prefix):]
+    return spotify_id
+
+
+def _populate_ids(all_songs_played: list) -> list:
+    """
+    Look up the album and artist ids for every distinct song in the play history.
+
+    Each track id is requested only once, in batches, through
+    get_multiple_tracks_information.
+
+    :param: all_songs_played list of played songs, each a dict with an 'id' field
+    :return: list of dicts with the keys track_id, album_id and artist_id
+    """
+    # Same batch size as before; presumably the per-request limit of the
+    # multiple-tracks endpoint - confirm against the API reference
+    batch_size = 50
+
+    # dict.fromkeys removes duplicates while keeping the first-seen order
+    unique_track_ids = list(dict.fromkeys(entry['id'] for entry in all_songs_played))
+    if not unique_track_ids:
+        return []
+
+    token = simple_authenticate()
+    all_songs_played_info = []
+
+    # Batch on the number of unique ids, not on the position in the play history,
+    # so that no request is ever sent with an empty or needlessly small id list
+    for start in range(0, len(unique_track_ids), batch_size):
+        batch = unique_track_ids[start:start + batch_size]
+        response = get_multiple_tracks_information(token, *batch)
+        all_songs_played_info.extend(_sort_and_create_required_dataset(response))
+
+    return all_songs_played_info
+
+
+def _sort_and_create_required_dataset(response) -> list:
+    """
+    Reduce a multiple-tracks API response to the ids needed for the database.
+
+    :param: response dict with a 'tracks' list, as passed in by _populate_ids
+    :return: list of dicts with the keys track_id, album_id and artist_id, where
+             artist_id is the id of the first artist listed for the track
+    """
+    track_list = []
+
+    for entry in response['tracks']:
+        # NOTE: the Spotify API is documented to put null in the position of an
+        # id it could not resolve; skip those instead of failing on None['id']
+        if entry is None:
+            continue
+        track_list.append({
+            'track_id': entry['id'],
+            'album_id': entry['album']['id'],
+            'artist_id': entry['artists'][0]['id'],
+        })
+
+    return track_list
+
+
+def _fill_missing_ids(all_songs_played, all_songs_catalogued):
+    """
+    Copy the catalogued artist and album ids onto the matching played songs.
+
+    The played songs are updated in place; songs without a catalogue match are
+    left untouched.
+
+    :param: all_songs_played list of played songs, each a dict with an 'id' field
+    :param: all_songs_catalogued list of dicts with track_id, album_id and artist_id
+    :return: the same all_songs_played list that was passed in
+    """
+    # Lookup table: track id -> (artist id, album id)
+    ids_by_track = {}
+    for catalogued in all_songs_catalogued:
+        key = catalogued['track_id']
+        album_id = catalogued['album_id']
+        artist_id = catalogued['artist_id']
+        ids_by_track[key] = (artist_id, album_id)
+
+    for played in all_songs_played:
+        song_id = played['id']
+        if song_id not in ids_by_track:
+            continue
+        played['artist_id'], played['album_id'] = ids_by_track[song_id]
+
+    return all_songs_played
+
+
+def _insert_data_into_db(all_songs_played: list):
+    """
+    Write every played song into the RECENTLY_PLAYED table.
+
+    A song that cannot be written (for example because an id field is missing)
+    is logged as an error and skipped; the remaining songs are still processed.
+
+    :param: all_songs_played list of all songs
+    """
+    for played in all_songs_played:
+        try:
+            row = (
+                played['timestamp'],
+                played['id'],
+                played['artist_id'],
+                played['album_id'],
+            )
+            db.add_row(Table.RECENTLY_PLAYED, row)
+        except Exception as e:
+            log.error(f'Failed adding {played} to database, error {e}')
+
+
+def export_gdpr_data(n_limit: int = 100):
+    """
+    Import the most recent plays from the GDPR export into the database.
+
+    The play history is read from disk, cut down to the newest n_limit entries,
+    completed with artist and album ids and then inserted into the database.
+
+    :param: n_limit maximum number of most recent plays to import; None imports
+            everything, zero or a negative number imports nothing
+    """
+    all_songs_played = _read_gdrp_data()
+    if n_limit is not None:
+        # A plain [-n_limit:] would return the WHOLE list for 0 (because -0 == 0)
+        # and would drop the oldest entries for negative values
+        all_songs_played = all_songs_played[-n_limit:] if n_limit > 0 else []
+    all_songs_catalogued = _populate_ids(all_songs_played)
+    all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
+    _insert_data_into_db(all_songs_played)