From ff9d726b478b13671898769cca5a97192eae72df Mon Sep 17 00:00:00 2001
From: Dominik <48656177+agresdominik@users.noreply.github.com>
Date: Sun, 23 Mar 2025 18:48:57 +0100
Subject: [PATCH] Feat/import gdrp data (#26)

* Some simple code for extracting data from the jsons

* Jupyter Notebook

* Mac specific gitignore

* Fixed finding paths to folders

* Delete src/gdpr_data directory

* Updated gitignore to include my testing file

* Added the standard saving path for the database in the database handler, this way multiple files don't have to be updated when moving database position

* Moved the API usage wrappers into an own file, added a function for getting multiple track_ids at once, this still needs to be tested more

* Further code for extracting data from the gdpr files

* Forgor

* Final&Tested version of get_multiple_tracks_information endpoint

* Further functionality: The code now extracts the id of each listened song and makes an api call to get info about these songs via the multiple tracks api.
Furthermore we track the songs which the call is made for already and skip these

* Added function to map catalogued ids into the play history

* Added args parser to runtime program, cleaned up some code

* Fixed a bug where the database would always try to create tables, even if it exists

* Added some small text for clean interface

* Some final fixes to actual code, fixed db bug, reversed the order of database entries

* Some documentation

* Added -export args to docker runtime

* fix
---
 .gitignore               |  12 +++
 .pre-commit-config.yaml  |   2 +-
 README.md                |   4 +
 docker/startup.sh        |   2 +-
 src/auth.py              |   2 +-
 src/database_handler.py  |   5 +-
 src/gdpr_analytics.ipynb |  96 +++++++++++++++++++++++
 src/gdpr_export.py       | 162 +++++++++++++++++++++++++++++++++++++++
 src/runtime.py           |  30 +++++++-
 src/scraper.py           | 100 ++++--------------------
 src/spotify_api.py       | 106 +++++++++++++++++++++++++
 11 files changed, 431 insertions(+), 90 deletions(-)
 create mode 100644 src/gdpr_analytics.ipynb
 create mode 100644 src/gdpr_export.py
 create mode 100644 src/spotify_api.py

diff --git a/.gitignore b/.gitignore
index be40467..b1657d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,15 @@
+# My testing file
+main_test.py
+
+# .db
+*.db
+
+# DS_Store
+.DS_Store
+
+# Gdpr Data file
+Streaming_History*
+
 # Test running file
 main_test.py
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f748997..87fdf73 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
         files: \.(json)$
       - id: check-added-large-files
         # Prevent large files from being committed
-        args: ['--maxkb=1000']
+        args: ['--maxkb=2000']
       - id: check-ast
         # Check for parse errors in Python files
         exclude: '.*test.*'
diff --git a/README.md b/README.md
index b6a4366..bac4f54 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,10 @@ docker run \
     predictify:unstable
 ```
 
+## GDPR Data
+
+If you have GDPR data, create a folder: ```data/gdpr_data``` and add all .json files containing your play history into it. In order to extract it, run the script: ```python3 src/runtime.py --export```
+
 ## Authors
 
 [Chris Kiriakou](https://github.com/ckiri)
diff --git a/docker/startup.sh b/docker/startup.sh
index f6092ad..3211f35 100755
--- a/docker/startup.sh
+++ b/docker/startup.sh
@@ -2,4 +2,4 @@
 #
 # Startup predictify. Don't use this. This is for docker specifically.
 source .venv/bin/activate
-.venv/bin/python src/runtime.py
+.venv/bin/python src/runtime.py --export
diff --git a/src/auth.py b/src/auth.py
index 57785aa..68be508 100644
--- a/src/auth.py
+++ b/src/auth.py
@@ -9,7 +9,7 @@ from urllib.parse import parse_qs, urlencode, urlparse
 import dotenv
 import requests
 
-TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../data', 'tokens.json')
+TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'tokens.json')
 
 
 def simple_authenticate(grant_type: str = "client_credentials") -> str:
diff --git a/src/database_handler.py b/src/database_handler.py
index 5a989dc..977bb8a 100644
--- a/src/database_handler.py
+++ b/src/database_handler.py
@@ -1,7 +1,10 @@
 import logging as log
+import os
 import sqlite3
 from enum import Enum
 
+DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
+
 
 class Table(Enum):
     TRACK_INFORMATION = "track_information"
@@ -16,7 +19,7 @@ class Database:
     A class to handle the database connection and operations
     """
 
-    def __init__(self, db_name):
+    def __init__(self, db_name: str = DATABASE_PATH):
         """Initialize the connection to the database"""
         self.db_name = db_name
         self.conn = sqlite3.connect(db_name)
diff --git a/src/gdpr_analytics.ipynb b/src/gdpr_analytics.ipynb
new file mode 100644
index 0000000..17a813c
--- /dev/null
+++ b/src/gdpr_analytics.ipynb
@@ -0,0 +1,96 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read out data from files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import os\n",
+    "from collections import defaultdict\n",
+    "\n",
+    "folder_path = os.path.join(os.getcwd(), '..', 'data', 'gdpr_data')\n",
+    "\n",
+    "play_list = []\n",
+    "\n",
+    "for filename in os.listdir(folder_path):\n",
+    "\n",
+    "    if filename.endswith('.json'):\n",
+    "        file_path = os.path.join(folder_path, filename)\n",
+    "        \n",
+    "        with open(file_path, 'r') as file:\n",
+    "            data = json.load(file)\n",
+    "\n",
+    "        for entry in data:\n",
+    "            try:\n",
+    "                track_id = entry['spotify_track_uri']\n",
+    "                name = entry['master_metadata_track_name']\n",
+    "                artist = entry['master_metadata_album_artist_name']\n",
+    "                album = entry['master_metadata_album_album_name']\n",
+    "                conn_country = entry['conn_country']\n",
+    "                played_on = entry['ts']\n",
+    "                played_track = {'track_id': track_id,\n",
+    "                                'timestamp': played_on,\n",
+    "                                'name': name,\n",
+    "                                'album': album,\n",
+    "                                'artist': artist,\n",
+    "                                'played_from': conn_country\n",
+    "                                }\n",
+    "                play_list.append(played_track)\n",
+    "                \n",
+    "            except Exception as e:\n",
+    "                print(f'Missing field: {e}')\n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Sort the playlist by timestamp\n",
+    "play_list.sort(key=lambda x: x['timestamp'])\n",
+    "\n",
+    "# Create a dictionary to store the number of times a track has been played\n",
+    "track_count = defaultdict(int)\n",
+    "\n",
+    "for track in play_list:\n",
+    "    track_count[track['name'], track['artist']] += 1\n",
+    "\n",
+    "# Make Track count readable in Data wrangler\n",
+    "track_count = [{'track': k, 'count': v} for k, v in track_count.items()]\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/gdpr_export.py b/src/gdpr_export.py
new file mode 100644
index 0000000..65c51f7
--- /dev/null
+++ b/src/gdpr_export.py
@@ -0,0 +1,162 @@
+import json
+import logging as log
+import os
+
+from auth import simple_authenticate
+from database_handler import Database, Table
+from spotify_api import get_multiple_tracks_information
+
+# Define the absolute folder path to the folder containing the gdpr retrieved data
+folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
+# Define the db
+db = Database()
+
+
+def _read_gdrp_data() -> list:
+    """
+    This function reads all .json files in the folder containing the gdpr data.
+    This data is then extracted into a list of dicts and sorted by timestamp ascending.
+
+    :return: all_songs_played: A list with one dict for every song played by the user
+    """
+    all_songs_played = []
+
+    for filename in os.listdir(folder_path):
+
+        if filename.endswith('.json'):
+            file_path = os.path.join(folder_path, filename)
+
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+
+            for entry in data:
+                # This removes all podcasts from the list
+                if entry.get('spotify_track_uri') is None:
+                    continue
+                try:
+                    track = {
+                        'timestamp': entry['ts'],
+                        'id': _extract_id(entry['spotify_track_uri']),
+                        'track_name': entry['master_metadata_track_name'],
+                        'artist_name': entry['master_metadata_album_artist_name'],
+                        'album_name': entry['master_metadata_album_album_name'],
+                        'conn_country': entry['conn_country'],
+                        'ms_played': entry['ms_played']
+                    }
+                    all_songs_played.append(track)
+                except KeyError as e:
+                    log.warning(f'Missing field: {e}')
+
+    all_songs_played = sorted(all_songs_played, key=lambda x: x['timestamp'])
+    return all_songs_played
+
+
+def _extract_id(spotify_id: str) -> str:
+    """
+    This function gets a id with extra details and extracts the id from it.
+
+    :param: spotify_id a string containing the id, e.g. spotify:track:<id>
+    :return: str the ID
+    """
+    prefix = "spotify:track:"
+    # Only strip the prefix if it is present, a bare id is returned unchanged
+    return spotify_id.removeprefix(prefix)
+
+
+def _populate_ids(all_songs_played: list):
+    """
+    This function looks up the album and artist ids for every distinct track in the play history.
+    The ids are requested in batches of 50, the maximum the multiple tracks api wrapper accepts.
+
+    :param: all_songs_played list of all songs
+    :return: list of dicts with the track_id, album_id and artist_id of each distinct track
+    """
+    track_ids = []
+    all_songs_played_info = []
+    token = simple_authenticate()
+
+    processed_songs_id = set()
+
+    for entry in all_songs_played:
+        track_id = entry['id']
+
+        if track_id not in processed_songs_id:
+            track_ids.append(track_id)
+            processed_songs_id.add(track_id)
+
+        # Flush on the number of collected ids, not on the play index, so a batch is never empty
+        if len(track_ids) == 50:
+            response = get_multiple_tracks_information(token, *track_ids)
+            track_ids.clear()
+            all_songs_played_info.extend(_sort_and_create_required_dataset(response))
+
+    if track_ids:
+        response = get_multiple_tracks_information(token, *track_ids)
+        all_songs_played_info.extend(_sort_and_create_required_dataset(response))
+
+    return all_songs_played_info
+
+
+def _sort_and_create_required_dataset(response) -> list:
+    """
+    This function reduces a multiple tracks api response to the ids needed for the database.
+
+    :param: response dict returned by the multiple tracks api
+    :return: list of dicts with the track_id, album_id and artist_id of each track
+    """
+    track_list = []
+
+    for entry in response['tracks']:
+        # Skip empty (null) entries, these have no ids to read
+        if entry is None:
+            continue
+        track_data = {
+            'track_id': entry['id'],
+            'album_id': entry['album']['id'],
+            'artist_id': entry['artists'][0]['id']
+        }
+        track_list.append(track_data)
+
+    return track_list
+
+
+def _fill_missing_ids(all_songs_played, all_songs_catalogued):
+
+    # Create a dictionary to map track_id to artist_id and album_id
+    track_id_to_artist_album = {data['track_id']: {'album_id': data['album_id'], 'artist_id': data['artist_id']} for data in all_songs_catalogued}
+
+    # Now, we will update the original `tracks` list by adding artist_id and album_id
+    for track in all_songs_played:
+        track_info = track_id_to_artist_album.get(track['id'])
+        if track_info:
+            track['artist_id'] = track_info['artist_id']
+            track['album_id'] = track_info['album_id']
+
+    return all_songs_played
+
+
+def _insert_data_into_db(all_songs_played: list):
+    """
+    This function takes a list of all played songs and inserts these into the database.
+
+    :param: all_songs_played list of all songs
+    """
+    for entry in all_songs_played:
+        try:
+            db.add_row(Table.RECENTLY_PLAYED, (entry['timestamp'], entry['id'], entry['artist_id'], entry['album_id']))
+        except Exception as e:
+            log.error(f'Failed adding {entry} to database, error {e}')
+
+
+def export_gdpr_data(n_limit: int = 100):
+    """
+    This function imports the most recently played songs from the gdpr data into the database.
+
+    :param: n_limit amount of most recent plays to import
+    """
+    all_songs_played = _read_gdrp_data()
+    # A slice starting at -0 would select the whole history instead of nothing
+    all_songs_played = all_songs_played[-n_limit:] if n_limit > 0 else []
+    all_songs_catalogued = _populate_ids(all_songs_played)
+    all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
+    _insert_data_into_db(all_songs_played)
diff --git a/src/runtime.py b/src/runtime.py
index 9903d98..c69f97e 100644
--- a/src/runtime.py
+++ b/src/runtime.py
@@ -1,8 +1,34 @@
+import argparse
 from time import sleep
 
-from scraper import scraping
+from gdpr_export import export_gdpr_data
+from scraper import scrape_missing_infos, scraping
+
+# Initialize the parser
+parser = argparse.ArgumentParser(description="A python script written in Python3.13 which continuously checks what spotify songs "
+                                             "the user is listening to and logging these in a local database. \n"
+                                             "The Script also has a export function where it can read out the gdpr data exported by the user.")
+
+# Add optional arguments
+parser.add_argument('--verbose', '-v', action='store_true', help="Enable verbose output")
+parser.add_argument('--export', '-e', action='store_true', help="Export the gdpr data from spotify if not done already")
+
+# Parse the arguments
+args = parser.parse_args()
+
+if args.verbose:
+    print('Enabled verbose mode')
+    # implement logger
+
+if args.export:
+    print('Scraping GDPR Data')
+    # The next function can get an int which defines the amount of songs which will be scraped from the gdpr files.
+    # e.g. if 500 is input, the last 500 played songs will come up, if left empty, the last 100.
+    export_gdpr_data()
+    scrape_missing_infos()
 
-# Run forever on intervals of 30 minutes
 while True:
+    print('Scraping API...')
     scraping()
+    print('Done Scraping')
     sleep(1800)
diff --git a/src/scraper.py b/src/scraper.py
index 3f57274..3d59a5b 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,9 +1,14 @@
-import requests
-
 from auth import authenticate, simple_authenticate
 from database_handler import Database, Table
+from spotify_api import (
+    get_album_information,
+    get_artist_information,
+    get_last_played_track,
+    get_track_information,
+)
 
-db = Database('./data/spotify_scraped.db')
+# Define DB
+db = Database()
 
 
 def scraping():
@@ -17,19 +22,20 @@ def scraping():
 
     # Once each 30 mins
     _read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
-    _scrape_missing_infos()
+    scrape_missing_infos()
 
     db.close()
 
 
 def _read_recently_played_page_and_add_to_db(bearer_token: str):
     """
+    This function gets a list of song play history and adds it into the database.
     """
     global db
 
-    last_played_track = _get_last_played_track(bearer_token=bearer_token)
+    last_played_track = get_last_played_track(bearer_token=bearer_token)
 
-    for track in last_played_track['items']:
+    for track in reversed(last_played_track['items']):
         track_id = track['track']['id']
         played_at = track['played_at']
         album_id = track['track']['album']['id']
@@ -37,83 +43,9 @@ def _read_recently_played_page_and_add_to_db(bearer_token: str):
         db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id))
 
 
-def _get_last_played_track(url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50", bearer_token: str = "") -> dict:
-    """
-    This function returns the last played track based on the limit size
-
-    :param limit: str
-    :param bearer_token: str
-    :return: dict
+def scrape_missing_infos():
     """
 
-    header = {
-        'Authorization': f'Bearer {bearer_token}'
-    }
-
-    response = requests.get(url, headers=header)
-    response_json = response.json()
-    return response_json
-
-
-def _get_track_information(track_id: str, bearer_token: str) -> dict:
-    """
-    This function returns the track information based on the track id
-
-    :param track_id: str
-    :param bearer_token: str
-    :return: dict
-    """
-
-    url = f"https://api.spotify.com/v1/tracks/{track_id}"
-    header = {
-        'Authorization': f'Bearer {bearer_token}'
-    }
-
-    response = requests.get(url, headers=header)
-    response_json = response.json()
-    return response_json
-
-
-def _get_artist_information(artist_id: str, bearer_token: str) -> dict:
-    """
-    This function returns the artist information based on the artist id
-
-    :param artist_id: str
-    :param bearer_token: str
-    :return: dict
-    """
-
-    url = f"https://api.spotify.com/v1/artists/{artist_id}"
-    header = {
-        'Authorization': f'Bearer {bearer_token}'
-    }
-
-    response = requests.get(url, headers=header)
-    response_json = response.json()
-    return response_json
-
-
-def _get_album_information(album_id: str, bearer_token: str) -> dict:
-    """
-    This function returns the album information based on the album id
-
-    :param album_id: str
-    :param bearer_token: str
-    :return: dict
-    """
-
-    url = f"https://api.spotify.com/v1/albums/{album_id}"
-    header = {
-        'Authorization': f'Bearer {bearer_token}'
-    }
-
-    response = requests.get(url, headers=header)
-    response_json = response.json()
-    return response_json
-
-
-def _scrape_missing_infos():
-    """
     """
     global db
 
@@ -124,14 +56,14 @@ def _scrape_missing_infos():
     all_track_ids_saved = db.read_all_rows(Table.TRACK_INFORMATION, 'track_id')
     all_track_ids_missing = list(set(all_track_ids_recently_played) - set(all_track_ids_saved))
     for track_id in all_track_ids_missing:
-        response = _get_track_information(track_id=track_id[0], bearer_token=bearer_token_simple)
+        response = get_track_information(track_id=track_id[0], bearer_token=bearer_token_simple)
         db.add_row(Table.TRACK_INFORMATION, (response['id'], response['name'], response['duration_ms'], response['explicit'], response['popularity']))
     # Album Info
     all_album_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'album_id')
     all_album_ids_saved = db.read_all_rows(Table.ALBUM_INFORMATION, 'album_id')
     all_album_ids_missing = list(set(all_album_ids_recently_played) - set(all_album_ids_saved))
     for album_id in all_album_ids_missing:
-        response = _get_album_information(album_id=album_id[0], bearer_token=bearer_token_simple)
+        response = get_album_information(album_id=album_id[0], bearer_token=bearer_token_simple)
         try:
             release_year = response['release_date'][:4]
         except Exception:
@@ -142,7 +74,7 @@ def _scrape_missing_infos():
     all_artist_ids_saved = db.read_all_rows(Table.ARTIST_INFORMATION, 'artist_id')
     all_artist_ids_missing = list(set(all_artist_ids_recently_played) - set(all_artist_ids_saved))
     for artist_id in all_artist_ids_missing:
-        response = _get_artist_information(artist_id=artist_id[0], bearer_token=bearer_token_simple)
+        response = get_artist_information(artist_id=artist_id[0], bearer_token=bearer_token_simple)
         try:
             genre = response['genres'][0]
         except IndexError:
diff --git a/src/spotify_api.py b/src/spotify_api.py
new file mode 100644
index 0000000..98a9664
--- /dev/null
+++ b/src/spotify_api.py
@@ -0,0 +1,106 @@
+import logging as log
+
+import requests
+
+
+def get_last_played_track(bearer_token: str, url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50") -> dict:
+    """
+    This function returns the last played track based on the limit size
+
+    :param url: str
+    :param bearer_token: str
+    :return: dict
+    """
+
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json
+
+
+def get_track_information(track_id: str, bearer_token: str) -> dict:
+    """
+    This function returns the track information based on the track id
+
+    :param track_id: str
+    :param bearer_token: str
+    :return: dict
+    """
+
+    url = f"https://api.spotify.com/v1/tracks/{track_id}"
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json
+
+
+def get_multiple_tracks_information(bearer_token: str, *track_ids) -> dict | None:
+    """
+    This function returns the track information based on the track ids
+
+    :param *track_ids: str
+    :param bearer_token: str
+    :return: dict, or None if more than 50 track ids are passed
+    """
+    if len(track_ids) > 50:
+        log.error('Passed more than 50 track ids to get_multiple_tracks_information')
+        return None
+
+    if not track_ids:
+        # Nothing to look up, skip the request and mirror the shape of the api response
+        return {'tracks': []}
+
+    # The endpoint expects all ids as one comma separated query value
+    ids = ",".join(track_ids)
+    url = f"https://api.spotify.com/v1/tracks?ids={ids}"
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json
+
+
+def get_artist_information(artist_id: str, bearer_token: str) -> dict:
+    """
+    This function returns the artist information based on the artist id
+
+    :param artist_id: str
+    :param bearer_token: str
+    :return: dict
+    """
+
+    url = f"https://api.spotify.com/v1/artists/{artist_id}"
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json
+
+
+def get_album_information(album_id: str, bearer_token: str) -> dict:
+    """
+    This function returns the album information based on the album id
+
+    :param album_id: str
+    :param bearer_token: str
+    :return: dict
+    """
+
+    url = f"https://api.spotify.com/v1/albums/{album_id}"
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json