From af6b3dba13c7595b512769715ba02e501748d25a Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 20:49:00 +0100
Subject: [PATCH 1/6] The most convoluted, code-efficient, bloated,
 boilerplated, duplicated, unnecessary, undocumented, soon to be regretted code

---
 src/scraper.py     | 104 ++++++++++++++++++++++++++++-----------------
 src/spotify_api.py |  29 +++++++++++++
 2 files changed, 93 insertions(+), 40 deletions(-)

diff --git a/src/scraper.py b/src/scraper.py
index 3d59a5b..e488e35 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,11 +1,6 @@
 from auth import authenticate, simple_authenticate
 from database_handler import Database, Table
-from spotify_api import (
-    get_album_information,
-    get_artist_information,
-    get_last_played_track,
-    get_track_information,
-)
+from spotify_api import get_last_played_track, get_multiple_field_information
 
 # Define DB
 db = Database()
@@ -20,12 +15,9 @@ def scraping():
     scope = "user-read-recently-played"
     bearer_token = authenticate(scope)
 
-    # Once each 30 mins
     _read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
     scrape_missing_infos()
 
-    db.close()
-
 
 def _read_recently_played_page_and_add_to_db(bearer_token: str):
     """
@@ -47,36 +39,68 @@ def scrape_missing_infos():
     """
 
     """
-    global db
-
     bearer_token_simple = simple_authenticate()
 
-    # Track Info
-    all_track_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'track_id')
-    all_track_ids_saved = db.read_all_rows(Table.TRACK_INFORMATION, 'track_id')
-    all_track_ids_missing = list(set(all_track_ids_recently_played) - set(all_track_ids_saved))
-    for track_id in all_track_ids_missing:
-        response = get_track_information(track_id=track_id[0], bearer_token=bearer_token_simple)
-        db.add_row(Table.TRACK_INFORMATION, (response['id'], response['name'], response['duration_ms'], response['explicit'], response['popularity']))
-    # Album Info
-    all_album_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'album_id')
-    all_album_ids_saved = db.read_all_rows(Table.ALBUM_INFORMATION, 'album_id')
-    all_album_ids_missing = list(set(all_album_ids_recently_played) - set(all_album_ids_saved))
-    for album_id in all_album_ids_missing:
-        response = get_album_information(album_id=album_id[0], bearer_token=bearer_token_simple)
-        try:
-            release_year = response['release_date'][:4]
-        except Exception:
-            release_year = ""
-        db.add_row(Table.ALBUM_INFORMATION, (response['id'], response['name'], response['album_type'], response['total_tracks'], release_year, response['label']))
-    # Artist Info
-    all_artist_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'artist_id')
-    all_artist_ids_saved = db.read_all_rows(Table.ARTIST_INFORMATION, 'artist_id')
-    all_artist_ids_missing = list(set(all_artist_ids_recently_played) - set(all_artist_ids_saved))
-    for artist_id in all_artist_ids_missing:
-        response = get_artist_information(artist_id=artist_id[0], bearer_token=bearer_token_simple)
-        try:
-            genre = response['genres'][0]
-        except IndexError:
-            genre = ""
-        db.add_row(Table.ARTIST_INFORMATION, (response['id'], response['name'], response['followers']['total'], genre, response['popularity']))
+    _scrape_missing_info(bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
+    _scrape_missing_info(bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
+    _scrape_missing_info(bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
+
+
+def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str):
+
+    if endpoint_name == 'albums':
+        limit = 20
+    else:
+        limit = 50
+
+    all_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, id_field_name)
+    all_ids_saved = db.read_all_rows(table_name, id_field_name)
+    all_ids_missing = list(set(all_ids_recently_played) - set(all_ids_saved))
+
+    ids = []
+    processed_ids = set()
+
+    for i, id_value in enumerate(all_ids_missing):
+
+        id_value_str = id_value[0]
+
+        if id_value_str not in processed_ids:
+            ids.append(id_value_str)
+            processed_ids.add(id_value_str)
+
+        if (i + 1) % limit == 0:
+            ids_tuple = tuple(ids)
+            ids.clear()
+            response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
+            _add_data_to_database(table_name, response)
+
+    if ids:
+        ids_tuple = tuple(ids)
+        ids.clear()
+        response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
+        _add_data_to_database(table_name, response)
+
+
+def _add_data_to_database(table_name: Table, response):
+
+    global db
+
+    if table_name == Table.TRACK_INFORMATION:
+        for entry in response['tracks']:
+            db.add_row(table_name, (entry['id'], entry['name'], entry['duration_ms'], entry['explicit'], entry['popularity']))
+
+    elif table_name == Table.ALBUM_INFORMATION:
+        for entry in response['albums']:
+            try:
+                release_year = entry['release_date'][:4]
+            except Exception:
+                release_year = ""
+            db.add_row(table_name, (entry['id'], entry['name'], entry['album_type'], entry['total_tracks'], release_year, entry['label']))
+
+    elif table_name == Table.ARTIST_INFORMATION:
+        for entry in response['artists']:
+            try:
+                genre = entry['genres'][0]
+            except IndexError:
+                genre = ""
+            db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
diff --git a/src/spotify_api.py b/src/spotify_api.py
index 98a9664..211fbf7 100644
--- a/src/spotify_api.py
+++ b/src/spotify_api.py
@@ -104,3 +104,32 @@ def get_album_information(album_id: str, bearer_token: str) -> dict:
     response = requests.get(url, headers=header)
     response_json = response.json()
     return response_json
+
+
+def get_multiple_field_information(bearer_token: str, api_type: str, limit: int,  *track_ids) -> dict:
+    """
+    This function returns the information for the given ids from the api_type endpoint
+
+    :param *track_ids: str
+    :param bearer_token: str
+    :return: dict
+    """
+
+    if len(track_ids) > limit:
+        log.error('Passed more than 20/50 ids to get_multiple_field_information')
+        return None
+
+    url_suffix = "ids="
+    separator = ","
+    for track_id in track_ids:
+        url_suffix = url_suffix + track_id + separator
+
+    url = f"https://api.spotify.com/v1/{api_type}?{url_suffix}"
+    url = url[:-len(separator)]
+    header = {
+        'Authorization': f'Bearer {bearer_token}'
+    }
+
+    response = requests.get(url, headers=header)
+    response_json = response.json()
+    return response_json

From 164ea9aca9c9d412e669154cc0803fead9c0b4e1 Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 20:49:34 +0100
Subject: [PATCH 2/6] 'fix' for a bug

---
 src/runtime.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/runtime.py b/src/runtime.py
index c69f97e..46855b8 100644
--- a/src/runtime.py
+++ b/src/runtime.py
@@ -1,9 +1,12 @@
 import argparse
 from time import sleep
 
+from database_handler import Database
 from gdpr_export import export_gdpr_data
 from scraper import scrape_missing_infos, scraping
 
+db = Database()
+
 # Initialize the parser
 parser = argparse.ArgumentParser(description="A python script written in Python3.13 which continuously checks what spotify songs "
                                              "the user is listening to and logging these in a local database. \n"
@@ -32,3 +35,7 @@ while True:
     scraping()
     print('Done Scraping')
     sleep(1800)
+
+
+# TODO: Not reached while the loop above keeps running; trap the exit signal and close the db there:
+db.close()

From e73752c0152b18ae9081bf591b78b7e596454e9f Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 21:54:44 +0100
Subject: [PATCH 3/6] Fixed counter issue where the accumulation would be
 calculated wrongly and the requests were not at the fullest capacity

---
 src/scraper.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/scraper.py b/src/scraper.py
index e488e35..489b72f 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -60,21 +60,25 @@ def _scrape_missing_info(bearer_token_simple: str, table_name: Table, id_field_n
     ids = []
     processed_ids = set()
 
-    for i, id_value in enumerate(all_ids_missing):
+    counter = 0
+
+    for id_value in all_ids_missing:
 
         id_value_str = id_value[0]
 
         if id_value_str not in processed_ids:
             ids.append(id_value_str)
             processed_ids.add(id_value_str)
+            counter += 1
 
-        if (i + 1) % limit == 0:
+        if (counter + 1) % limit == 0 and len(ids) > 0:
             ids_tuple = tuple(ids)
             ids.clear()
             response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
             _add_data_to_database(table_name, response)
+            counter = 0
 
-    if ids:
+    if len(ids) > 0:
         ids_tuple = tuple(ids)
         ids.clear()
         response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)

From 7552d437022a6b25003df24c36fab6ba6d06e6e8 Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 21:55:11 +0100
Subject: [PATCH 4/6] Bigger batch size

---
 src/runtime.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/runtime.py b/src/runtime.py
index 46855b8..ee73e83 100644
--- a/src/runtime.py
+++ b/src/runtime.py
@@ -27,7 +27,7 @@ if args.export:
     print('Scraping GDPR Data')
     # The next function can gat a int witch defines the amount of songs witch will be scraped from the gdpr files.
     # e.g. if 500 is input, the last 500 played songs will come up, if left empty, the last 100.
-    export_gdpr_data()
+    export_gdpr_data(1000000)
     scrape_missing_infos()
 
 while True:

From 9d48f7bef7088cc2778e7a96657b6e28ac59c7bc Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 21:55:38 +0100
Subject: [PATCH 5/6] Fixed counter bug

---
 src/gdpr_export.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/gdpr_export.py b/src/gdpr_export.py
index 65c51f7..319a20c 100644
--- a/src/gdpr_export.py
+++ b/src/gdpr_export.py
@@ -4,7 +4,7 @@ import os
 
 from auth import simple_authenticate
 from database_handler import Database, Table
-from spotify_api import get_multiple_tracks_information
+from spotify_api import get_multiple_field_information
 
 # Define the absolute folder path to the folder containing the gdrp retrieved data
 folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
@@ -71,22 +71,26 @@ def _populate_ids(all_songs_played: list):
 
     processed_songs_id = set()
 
-    for i, entry in enumerate(all_songs_played):
+    counter = 0
+
+    for entry in all_songs_played:
         track_id = entry['id']
 
         if track_id not in processed_songs_id:
             track_ids.append(track_id)
             processed_songs_id.add(track_id)
+            counter += 1
 
-        if (i + 1) % 50 == 0:
+        if (counter + 1) % 50 == 0 and len(track_ids) > 0:
             track_ids_tuple = tuple(track_ids)
             track_ids.clear()
-            response = get_multiple_tracks_information(token, *track_ids_tuple)
+            response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple)
             all_songs_played_info.extend(_sort_and_create_required_dataset(response))
+            counter = 0
 
-    if track_ids:
+    if len(track_ids) > 0:
         track_ids_tuple = tuple(track_ids)
-        response = get_multiple_tracks_information(token, *track_ids_tuple)
+        response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple)
         all_songs_played_info.extend(_sort_and_create_required_dataset(response))
 
     return all_songs_played_info

From c65a1a8c8b5d877e00c5a14eb249441593b5e79c Mon Sep 17 00:00:00 2001
From: agres <agres.dominik@gmail.com>
Date: Sun, 23 Mar 2025 21:56:09 +0100
Subject: [PATCH 6/6] Unified spotify api request into one function

---
 src/spotify_api.py | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/src/spotify_api.py b/src/spotify_api.py
index 211fbf7..b8a2f91 100644
--- a/src/spotify_api.py
+++ b/src/spotify_api.py
@@ -40,34 +40,6 @@ def get_track_information(track_id: str, bearer_token: str) -> dict:
     return response_json
 
 
-def get_multiple_tracks_information(bearer_token: str, *track_ids) -> dict:
-    """
-    This function returns the track information based on the track id
-
-    :param *track_id: str
-    :param bearer_token: str
-    :return: dict
-    """
-    if len(track_ids) > 50:
-        log.error('Passed more than 50 track ids to get_multiple_tracks_information')
-        return None
-
-    url_suffix = "ids="
-    separator = ","
-    for track_id in track_ids:
-        url_suffix = url_suffix + track_id + separator
-
-    url = f"https://api.spotify.com/v1/tracks?{url_suffix}"
-    url = url[:-len(separator)]
-    header = {
-        'Authorization': f'Bearer {bearer_token}'
-    }
-
-    response = requests.get(url, headers=header)
-    response_json = response.json()
-    return response_json
-
-
 def get_artist_information(artist_id: str, bearer_token: str) -> dict:
     """
     This function returns the artist information based on the artist id