diff --git a/.gitignore b/.gitignore index 0a19790..6e93763 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,15 @@ +# Test running file +main_test.py + +# databases +*.db + +# Custom Tokens file/rotator +tokens.json + +# Visual Studio Code +.vscode/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -129,10 +141,9 @@ celerybeat.pid # Environments .env +!.env.example .venv -env/ venv/ -ENV/ env.bak/ venv.bak/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f748997 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +# .pre-commit-config.yaml + +repos: + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace # Remove trailing whitespace + exclude: '.*test.*' + files: \.(py)$ + + - id: end-of-file-fixer # Ensure a single newline at the end of a file + exclude: '.*test.*' + files: \.(py)$ + + - id: check-yaml # Check if the YAML files are valid + exclude: '.*test.*' + files: \.(yaml|yml)$ + + - id: check-json # Check if the JSON files are valid + exclude: '.*test.*' + files: \.(json)$ + + - id: check-added-large-files # Prevent large files from being committed + args: ['--maxkb=1000'] + + - id: check-ast # Check for parse errors in Python files + exclude: '.*test.*' + files: \.(py)$ + + - id: debug-statements # Check for debugger imports and breakpoint() calls + exclude: '.*test.*' + files: \.(py)$ + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + args: ['--profile=black'] + files: \.(py)$ + exclude: '.*test.*' + + - repo: https://github.com/PyCQA/flake8 + rev: 7.1.1 + hooks: + - id: flake8 + args: ['--extend-ignore=E501,E402,W503,E721','--max-line-length=100'] + files: \.(py)$ + exclude: '.*test.*' diff --git a/README.md b/README.md index c934cae..f0a6c4a 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -# predictify \ No newline at end of file +# Predictify + +## Overview + +A data analysis tool to scrape 
your Spotify listening history and let an ML model predict your next songs + +## Authentication API + +[Official Documentation](https://developer.spotify.com/documentation/web-api/tutorials/getting-started) +[Authorization Code Flow](https://developer.spotify.com/documentation/web-api/tutorials/code-flow) + +## Potentially Useful APIs + +Recently Played Tracks: /me/player/recently-played [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-recently-played) + +Get Track: /tracks/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-track) + +Get Track's Audio Features - Deprecated: /audio-features/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features) + +Get Track's Audio Analysis - Deprecated: /audio-analysis/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis) + +Get Artist: /artists/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-an-artist) + +## Authors + +[Chris Kiriakou](https://github.com/ckiri) +[Dominik Agres](https://github.com/agresdominik) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..46f840e --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +# pytest.ini +[pytest] +# Set the root directory to the current directory (.) +rootdir = . +pythonpath = . 
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..89f3571 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +python-dotenv==1.0.1 +requests==2.32.3 +pre-commit==4.1.0 +pytest==8.3.5 +coverage==7.7.0 +pytest-cov==6.0.0 diff --git a/src/auth.py b/src/auth.py new file mode 100644 index 0000000..ab64646 --- /dev/null +++ b/src/auth.py @@ -0,0 +1,256 @@ +import base64 +import json +import os +import time +from http.server import BaseHTTPRequestHandler, HTTPServer +from urllib.parse import parse_qs, urlencode, urlparse + +import dotenv +import requests + +TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'env', 'tokens.json') + + +def simple_authenticate(grant_type: str = "client_credentials") -> str: + """ + This function authenticates the user and returns the access token + + :return: str + """ + spotify_client_id, spotify_client_secret, spotify_redirect_uri = _read_env_file() + token_url = "https://accounts.spotify.com/api/token" + auth_value = f"{spotify_client_id}:{spotify_client_secret}" + auth_header = base64.b64encode(auth_value.encode('utf-8')).decode('utf-8') + + headers = { + "Authorization": f"Basic {auth_header}", + "Content-Type": "application/x-www-form-urlencoded" + } + + data = { + "grant_type": f"{grant_type}" + } + + response = requests.post(token_url, headers=headers, data=data) + + if response.status_code == 200: + access_token = response.json().get('access_token') + return access_token + else: + print(f"Error {response.status_code}: {response.text}") + + +def authenticate(scope: str) -> str: + """ + This function authenticates the user and returns the access token + + :param scope: str + :return: str + """ + spotify_client_id, spotify_client_secret, spotify_redirect_uri = _read_env_file() + + tokens = _load_tokens(scope) + if tokens: + access_token, refresh_token, expires_at = tokens + if time.time() < expires_at: + return access_token + else: + print(f"Token for scope {scope} expired, 
refreshing...") + access_token, expires_at = _refresh_access_token(refresh_token, spotify_client_id, spotify_client_secret) + _refresh_tokens_file(access_token, scope, expires_at) + return access_token + + auth_url = _get_authorization_url(spotify_client_id, spotify_redirect_uri, scope) + print(f'Please go to the following URL to authorize the app: {auth_url}') + + authorization_code = _start_server_and_wait_for_code() + + access_token, refresh_token, expires_at = _exchange_code_for_token(authorization_code, redirect_uri=spotify_redirect_uri, + client_id=spotify_client_id, client_secret=spotify_client_secret) + + _save_tokens(access_token, refresh_token, scope, expires_at) + + return access_token + + +def _get_authorization_url(client_id: str, redirect_uri: str, scope: str) -> str: + """ + This function generates the URL that the user needs to visit to authorize the app + + :param client_id: str + :param redirect_uri: str + :param scope: str + :return: str + """ + + auth_params = { + "response_type": "code", + "client_id": client_id, + "scope": scope, + "redirect_uri": redirect_uri, + "state": str(int(time.time())) + } + auth_url = "https://accounts.spotify.com/authorize?" 
+ urlencode(auth_params) + return auth_url + + +def _read_env_file() -> tuple: + """ + This function reads the .env file and returns the client_id, client_secret and redirect_uri + + :return: tuple + """ + current_dir = os.path.dirname(os.path.abspath(__file__)) + dotenv_folder_path = os.path.join(current_dir, 'env') + dotenv_path = os.path.join(dotenv_folder_path, '.env') + contents = dotenv.dotenv_values(dotenv_path=dotenv_path) + spotify_client_id = contents['SPOTIFY_CLIENT_ID'] + spotify_client_secret = contents['SPOTIFY_CLIENT_SECRET'] + spotify_redirect_uri = contents['SPOTIFY_REDIRECT_URI'] + return spotify_client_id, spotify_client_secret, spotify_redirect_uri + + +def _start_server_and_wait_for_code() -> any: + """ + This function starts a server and waits for the user to visit the authorization URL + and get the authorization code + + :return: any + """ + class CallbackHandler(BaseHTTPRequestHandler): + def do_GET(self): + parsed_url = urlparse(self.path) + query_params = parse_qs(parsed_url.query) + if 'code' in query_params: + self.server.authorization_code = query_params['code'][0] + self.send_response(200) + self.end_headers() + self.wfile.write(b"Authorization successful! 
You can close this window.") + + server = HTTPServer(('localhost', 8888), CallbackHandler) + print("Starting server to capture the authorization code...") + server.handle_request() + return server.authorization_code + + +def _exchange_code_for_token(code: str, redirect_uri: str, client_id: str, client_secret: str) -> tuple: + """ + This function exchanges the authorization code for an access token + + :param code: str + :param redirect_uri: str + :param client_id: str + :param client_secret: str + :return: tuple + """ + + token_url = "https://accounts.spotify.com/api/token" + headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + } + + data = { + 'grant_type': 'authorization_code', + 'code': code, + 'redirect_uri': redirect_uri, + 'client_id': client_id, + 'client_secret': client_secret, + } + + response = requests.post(token_url, data=data, headers=headers) + response_data = response.json() + + if 'access_token' not in response_data: + raise Exception("Failed to get access token") + + access_token = response_data['access_token'] + refresh_token = response_data.get('refresh_token', None) + expires_in = response_data['expires_in'] + expires_at = time.time() + expires_in + return access_token, refresh_token, expires_at + + +def _refresh_access_token(refresh_token: str, client_id: str, client_secret: str) -> tuple: + """ + Refreshes the access token using the refresh token. 
+ + :param refresh_token: str + :param client_id: str + :param client_secret: str + :return: tuple + """ + token_url = "https://accounts.spotify.com/api/token" + headers = { + 'Content-Type': 'application/x-www-form-urlencoded', + } + + data = { + 'grant_type': 'refresh_token', + 'refresh_token': refresh_token, + 'client_id': client_id, + 'client_secret': client_secret, + } + + response = requests.post(token_url, data=data, headers=headers) + response_data = response.json() + + if 'access_token' not in response_data: + raise Exception("Failed to refresh access token") + + access_token = response_data['access_token'] + expires_in = response_data['expires_in'] + expires_at = time.time() + expires_in + return access_token, expires_at + + +def _load_tokens(scope: str) -> tuple: + """ + Loads the stored tokens for the given scope from the local file; expiry is not checked here. + + :return: tuple or None + """ + if os.path.exists(TOKEN_FILE_PATH): + with open(TOKEN_FILE_PATH, 'r') as f: + tokens = json.load(f) + if scope in tokens: + if 'access_token' in tokens[scope] and 'refresh_token' in tokens[scope] and 'expires_at' in tokens[scope]: + return tokens[scope]['access_token'], tokens[scope]['refresh_token'], tokens[scope]['expires_at'] + return None + + +def _save_tokens(access_token: str, refresh_token: str, scope: str, expires_at) -> None: + """ + Saves the access and refresh tokens to a local file, replacing its previous contents. + + :param access_token: str + :param refresh_token: str + :param scope: str + """ + tokens = { + scope: { + 'access_token': access_token, + 'refresh_token': refresh_token, + 'expires_at': expires_at + }, + } + with open(TOKEN_FILE_PATH, 'w') as f: + json.dump(tokens, f) + + +def _refresh_tokens_file(access_token: str, scope: str, expires_at) -> None: + """ + Updates the stored access token and its expiry time for a scope in the local file. 
+ + :param access_token: str + :param scope: str + """ + with open(TOKEN_FILE_PATH, 'r') as file: + tokens = json.load(file) + + if scope in tokens and 'refresh_token' in tokens[scope]: + tokens[scope]['access_token'] = access_token + tokens[scope]['expires_at'] = expires_at + with open(TOKEN_FILE_PATH, 'w') as file: + json.dump(tokens, file, indent=4) + else: + print(f"Error: Scope '{scope}' or refresh_token not found in the tokens file.") diff --git a/src/database_handler.py b/src/database_handler.py new file mode 100644 index 0000000..ce0478e --- /dev/null +++ b/src/database_handler.py @@ -0,0 +1,87 @@ +import sqlite3 +from enum import Enum + + +class Table(Enum): + TRACK_INFORMATION = "track_information" + ARTIST_INFORMATION = "artist_information" + ALBUM_INFORMATION = "album_information" + TRACK_ATTRIBUTES = "track_attributes" + RECENTLY_PLAYED = "recently_played" + + +class Database: + """ + A class to handle the database connection and operations + """ + + def __init__(self, db_name): + """Initialize the connection to the database""" + self.db_name = db_name + self.conn = sqlite3.connect(db_name) + self.cursor = self.conn.cursor() + self.create_tables() + + def create_tables(self): + """Create the tables in the database""" + + self.cursor.execute(f''' + CREATE TABLE IF NOT EXISTS {Table.TRACK_INFORMATION.value} ( + track_id TEXT PRIMARY KEY, + title TEXT + ); + ''') + + self.cursor.execute(f''' + CREATE TABLE IF NOT EXISTS {Table.ARTIST_INFORMATION.value} ( + artist_id TEXT PRIMARY KEY, + artist_name TEXT + ); + ''') + + self.cursor.execute(f''' + CREATE TABLE IF NOT EXISTS {Table.ALBUM_INFORMATION.value} ( + album_id TEXT PRIMARY KEY, + album_name TEXT + ); + ''') + + self.cursor.execute(f''' + CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} ( + track_id TEXT PRIMARY KEY, + attribute_name TEXT, + attribute_value TEXT + ); + ''') + + self.cursor.execute(f''' + CREATE TABLE IF NOT EXISTS {Table.RECENTLY_PLAYED.value} ( + played_at TIMESTAMP PRIMARY 
KEY, + track_id TEXT, + artist_id TEXT, + album_id TEXT, + FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id), + FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id), + FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id) + ); + ''') + + # Commit the changes + self.conn.commit() + + def add_row(self, table: Table, values): + """Add a new row into the specified table; a row whose primary key is already stored is skipped""" + placeholders = ', '.join(['?'] * len(values)) + query = f"INSERT OR IGNORE INTO {table.value} VALUES ({placeholders})" + self.cursor.execute(query, values) + self.conn.commit() + + def read_all_rows(self, table: Table, column: str = "*"): + """Read all rows from the specified table""" + self.cursor.execute(f"SELECT {column} FROM {table.value}") + rows = self.cursor.fetchall() + return rows + + def close(self): + """Close the database connection""" + self.conn.close() diff --git a/src/env/.env.example b/src/env/.env.example new file mode 100644 index 0000000..545e283 --- /dev/null +++ b/src/env/.env.example @@ -0,0 +1,3 @@ +SPOTIFY_CLIENT_ID=your_token_here +SPOTIFY_CLIENT_SECRET=your_token_here +SPOTIFY_REDIRECT_URI=http://localhost:8888/callback diff --git a/src/scraper.py b/src/scraper.py new file mode 100644 index 0000000..e063310 --- /dev/null +++ b/src/scraper.py @@ -0,0 +1,85 @@ +import requests + +from auth import authenticate, simple_authenticate +from database_handler import Database, Table + +db = Database('spotify_scraped.db') + + +def main(): + """ + This function is the main function that will be executed when the script is run + """ + global db + + scope = "user-read-recently-played" + bearer_token = authenticate(scope) + + # Once each 30 mins + _read_recently_played_page_and_add_to_db(bearer_token=bearer_token) + + # Once a day + all_track_ids = db.read_all_rows(Table.RECENTLY_PLAYED, 'track_id') + bearer_toke_simple = simple_authenticate() + for track_id in all_track_ids: + response = 
_get_track_information(track_id=track_id[0], bearer_token=bearer_toke_simple) + print(response) + + # Close the database connection + db.close() + + +def _read_recently_played_page_and_add_to_db(bearer_token: str): + """Fetches the recently played items and adds one row (played_at, track, first artist, album) per item to the recently played table + """ + global db + + last_played_track = _get_last_played_track(bearer_token=bearer_token) + + for track in last_played_track['items']: + track_id = track['track']['id'] + played_at = track['played_at'] + album_id = track['track']['album']['id'] + artist_id = track['track']['artists'][0]['id'] + db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id)) + + +def _get_last_played_track(url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50", bearer_token: str = "") -> dict: + """ + This function requests the given recently played URL and returns the parsed JSON response + + :param url: str + :param bearer_token: str + :return: dict + """ + + header = { + 'Authorization': f'Bearer {bearer_token}' + } + + response = requests.get(url, headers=header) + response_json = response.json() + return response_json + + +def _get_track_information(track_id: str, bearer_token: str) -> dict: + """ + This function returns the track information based on the track id + + :param track_id: str + :param bearer_token: str + :return: dict + """ + + url = f"https://api.spotify.com/v1/tracks/{track_id}" + header = { + 'Authorization': f'Bearer {bearer_token}' + } + + response = requests.get(url, headers=header) + response_json = response.json() + return response_json + + +if __name__ == '__main__': + main() diff --git a/test/test_auth.py b/test/test_auth.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_scraper.py b/test/test_scraper.py new file mode 100644 index 0000000..e69de29