diff --git a/.gitignore b/.gitignore index 0a19790..6e93763 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,15 @@ +# Test running file +main_test.py + +# databases +*.db + +# Custom Tokens file/rotator +tokens.json + +# Visual Studio Code +.vscode/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -129,10 +141,9 @@ celerybeat.pid # Environments .env +!.env.example .venv -env/ venv/ -ENV/ env.bak/ venv.bak/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..f748997 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +# .pre-commit-config.yaml + +repos: + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace # Remove trailing whitespace + exclude: '.*test.*' + files: \.(py)$ + + - id: end-of-file-fixer # Ensure a single newline at the end of a file + exclude: '.*test.*' + files: \.(py)$ + + - id: check-yaml # Check if the YAML files are valid + exclude: '.*test.*' + files: \.(yaml|yml)$ + + - id: check-json # Check if the JSON files are valid + exclude: '.*test.*' + files: \.(json)$ + + - id: check-added-large-files # Prevent large files from being committed + args: ['--maxkb=1000'] + + - id: check-ast # Check for parse errors in Python files + exclude: '.*test.*' + files: \.(py)$ + + - id: debug-statements # Check for print statements and pdb calls + exclude: '.*test.*' + files: \.(py)$ + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + args: ['--profile=black'] + files: \.(py)$ + exclude: '.*test.*' + + - repo: https://github.com/PyCQA/flake8 + rev: 7.1.1 + hooks: + - id: flake8 + args: ['--extend-ignore=E501,E402,W503,E721','--max-line-length=100'] + files: \.(py)$ + exclude: '.*test.*' diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..45f0f04 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +FROM alpine:latest + +WORKDIR /root + +RUN apk update && \ + apk add --no-cache \ + 
openssh \ + python3 \ + py3-pip \ + sqlite + +EXPOSE 22 + +RUN mkdir /root/src + +COPY ./startup.sh /root +COPY ./requirements.txt /root +COPY ./src/ /root/src/ + +RUN ls -la + +VOLUME /root + +ENTRYPOINT ["/bin/sh", "/root/startup.sh"] diff --git a/README.md b/README.md index c934cae..f0a6c4a 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -# predictify \ No newline at end of file +# Predictify + +## Overview + +A data analysis tool that scrapes your Spotify listening history and lets an ML model predict your next songs. + +## Authentication API + +[Official Documentation](https://developer.spotify.com/documentation/web-api/tutorials/getting-started) +[Authorization Code Flow](https://developer.spotify.com/documentation/web-api/tutorials/code-flow) + +## Potentially useful APIs + +Recently Played Tracks: /me/player/recently-played [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-recently-played) + +Get Track: /tracks/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-track) + +Get Track's Audio Features - Deprecated: /audio-features/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features) + +Get Track's Audio Analysis - Deprecated: /audio-analysis/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis) + +Get Artist: /artists/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-an-artist) + +## Authors + +[Chris Kiriakou](https://github.com/ckiri) +[Dominik Agres](https://github.com/agresdominik) diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..46f840e --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +# pytest.ini +[pytest] +# Set the root directory to the current directory (.) +rootdir = . +pythonpath = . 
def authenticate(scope: str) -> str:
    """
    Return a valid user access token for the given scope.

    A cached token is reused while it is still valid. An expired token is
    renewed with the stored refresh token. When no usable cached token exists
    (nothing stored for the scope, no refresh token stored, or the refresh is
    rejected) the interactive authorization-code flow is started: the
    authorization URL is printed and a local callback server waits for the
    redirect that carries the authorization code.

    :param scope: str
    :return: str
    """
    # Treat a token that is about to expire as expired, so that it does not
    # run out between this check and the API requests made with it.
    expiry_margin_seconds = 60

    spotify_client_id, spotify_client_secret, spotify_redirect_uri = _read_env_file()

    tokens = _load_tokens(scope)
    if tokens:
        access_token, refresh_token, expires_at = tokens
        if time.time() < expires_at - expiry_margin_seconds:
            return access_token

        if refresh_token:
            log.info(f"Token for scope {scope} expired, refreshing...")
            try:
                access_token, expires_at = _refresh_access_token(refresh_token, spotify_client_id, spotify_client_secret)
            except requests.RequestException:
                # A transport problem is not an authorization problem; asking
                # the user to re-authorize would not help, so let it surface.
                raise
            except Exception as e:
                # _refresh_access_token raises a plain Exception when the
                # response carries no access token (e.g. a revoked refresh
                # token). Fall through to a fresh interactive authorization
                # instead of failing the same way on every run.
                log.warning(f"Refreshing the token for scope {scope} failed ({e}), re-authorizing...")
            else:
                _refresh_tokens_file(access_token, scope, expires_at)
                return access_token
        else:
            # The token response does not always contain a refresh token
            # (_exchange_code_for_token stores None in that case).
            log.info(f"No refresh token stored for scope {scope}, re-authorizing...")

    auth_url = _get_authorization_url(spotify_client_id, spotify_redirect_uri, scope)
    print(f'Please go to the following URL to authorize the app: {auth_url}')

    authorization_code = _start_server_and_wait_for_code()

    access_token, refresh_token, expires_at = _exchange_code_for_token(
        authorization_code,
        redirect_uri=spotify_redirect_uri,
        client_id=spotify_client_id,
        client_secret=spotify_client_secret,
    )

    _save_tokens(access_token, refresh_token, scope, expires_at)

    return access_token
def _start_server_and_wait_for_code(host: str = 'localhost', port: int = 8888) -> str:
    """
    This function starts a temporary local HTTP server, waits for the redirect
    of the authorization flow and returns the authorization code it carries.

    Requests that carry neither a ``code`` nor an ``error`` query parameter
    (for example a browser's favicon request) are answered with 404 and
    ignored; the server keeps waiting for the actual callback. The listening
    socket is closed before the function returns or raises.

    :param host: str, interface to listen on (should match the redirect URI)
    :param port: int, port to listen on (should match the redirect URI)
    :return: str
    :raises RuntimeError: if the redirect carries an ``error`` parameter
    """
    class CallbackHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            query_params = parse_qs(urlparse(self.path).query)
            if 'code' in query_params:
                self.server.authorization_code = query_params['code'][0]
                self._respond(200, b"Authorization successful! You can close this window.")
            elif 'error' in query_params:
                self.server.authorization_error = query_params['error'][0]
                self._respond(400, b"Authorization failed. You can close this window.")
            else:
                # Unrelated request; answer it so the client does not hang.
                self._respond(404, b"Not found.")

        def _respond(self, status, body):
            self.send_response(status)
            self.send_header('Content-Type', 'text/plain; charset=utf-8')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    # The context manager calls server_close(), releasing the port even when
    # handling a request raises.
    with HTTPServer((host, port), CallbackHandler) as server:
        # Initialise both attributes so they can be read even if no request
        # ever sets them.
        server.authorization_code = None
        server.authorization_error = None
        log.info("Starting server to capture the authorization code...")
        while server.authorization_code is None and server.authorization_error is None:
            server.handle_request()
        authorization_code = server.authorization_code
        authorization_error = server.authorization_error

    if authorization_error is not None:
        raise RuntimeError(f"Authorization failed: {authorization_error}")
    return authorization_code
def _load_tokens(scope: str) -> tuple:
    """
    Loads the stored tokens for a scope from the local tokens file.

    Expiry is NOT checked here; the caller compares ``expires_at`` itself.

    :param scope: str
    :return: tuple (access_token, refresh_token, expires_at), or None when the
             file is missing or unreadable, or the scope has no complete entry
    """
    try:
        with open(TOKEN_FILE_PATH, 'r') as f:
            tokens = json.load(f)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError as e:
        # A truncated or hand-edited file is treated like a missing one so
        # that the caller can simply authorize again.
        log.warning(f"Ignoring unreadable tokens file {TOKEN_FILE_PATH}: {e}")
        return None

    entry = tokens.get(scope) if isinstance(tokens, dict) else None
    if not isinstance(entry, dict):
        return None

    # All three keys are read below, so all three must be present.
    required_keys = ('access_token', 'refresh_token', 'expires_at')
    if any(key not in entry for key in required_keys):
        return None
    return entry['access_token'], entry['refresh_token'], entry['expires_at']


def _save_tokens(access_token: str, refresh_token: str, scope: str, expires_at) -> None:
    """
    Saves the access and refresh tokens of a scope to the local tokens file.

    Entries stored for other scopes are preserved; only the entry of
    ``scope`` is added or replaced.

    :param access_token: str
    :param refresh_token: str
    :param scope: str
    :param expires_at: point in time (seconds since the epoch) at which the
                       access token expires
    """
    try:
        with open(TOKEN_FILE_PATH, 'r') as f:
            tokens = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        tokens = {}
    if not isinstance(tokens, dict):
        tokens = {}

    tokens[scope] = {
        'access_token': access_token,
        'refresh_token': refresh_token,
        'expires_at': expires_at
    }
    with open(TOKEN_FILE_PATH, 'w') as f:
        # Same layout as the file written by _refresh_tokens_file.
        json.dump(tokens, f, indent=4)
class Table(Enum):
    # Names of the SQLite tables managed by Database.
    TRACK_INFORMATION = "track_information"
    ARTIST_INFORMATION = "artist_information"
    ALBUM_INFORMATION = "album_information"
    TRACK_ATTRIBUTES = "track_attributes"
    RECENTLY_PLAYED = "recently_played"


class Database:
    """
    A class to handle the database connection and operations.

    The connection is opened (and the tables are created) on construction.
    After close() the object stays usable: the next operation transparently
    reconnects to the same database file. This matters because the scraper
    closes its shared instance at the end of every cycle and reuses it in the
    next one. For an in-memory database (':memory:') a reconnect starts with
    empty tables.

    The class can also be used as a context manager, which closes the
    connection on exit.
    """

    def __init__(self, db_name):
        """Initialize the connection to the database"""
        self.db_name = db_name
        self.conn = None
        self.cursor = None
        self._connect()

    def __enter__(self):
        self._ensure_open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _connect(self):
        """Open the connection, create a cursor and make sure the tables exist"""
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
        self.create_tables()

    def _ensure_open(self):
        """Reconnect if the connection has been closed"""
        if self.conn is None:
            self._connect()

    def create_tables(self):
        """Create the tables in the database"""
        self._ensure_open()

        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {Table.TRACK_INFORMATION.value} (
                track_id TEXT PRIMARY KEY,
                title TEXT,
                duration_ms INTEGER,
                explicit BOOLEAN,
                popularity INTEGER
            );
        ''')

        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {Table.ARTIST_INFORMATION.value} (
                artist_id TEXT PRIMARY KEY,
                artist_name TEXT,
                followers INTEGER,
                genres TEXT,
                popularity INTEGER
            );
        ''')

        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {Table.ALBUM_INFORMATION.value} (
                album_id TEXT PRIMARY KEY,
                album_name TEXT,
                album_type TEXT,
                total_tracks INTEGER,
                release_date TEXT,
                label TEXT
            );
        ''')

        # NOTE(review): track_id as the sole primary key allows only one
        # attribute row per track - confirm that this is intended.
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
                track_id TEXT PRIMARY KEY,
                attribute_name TEXT,
                attribute_value TEXT
            );
        ''')

        # NOTE(review): SQLite only enforces these foreign keys when
        # "PRAGMA foreign_keys = ON" is set, which is not done here.
        self.cursor.execute(f'''
            CREATE TABLE IF NOT EXISTS {Table.RECENTLY_PLAYED.value} (
                played_at TIMESTAMP PRIMARY KEY,
                track_id TEXT,
                artist_id TEXT,
                album_id TEXT,
                FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
                FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
                FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
            );
        ''')

        # Commit the changes
        self.conn.commit()

    def add_row(self, table: Table, values) -> bool:
        """
        Add a new row into the specified table.

        The insert is best effort: database errors are logged, not raised.
        A constraint violation (typically a row whose primary key is already
        stored) is logged at debug level; any other database error is logged
        at error level.

        :param table: Table
        :param values: sequence with one value per column of the table
        :return: bool, True if the row was inserted, False otherwise
        """
        self._ensure_open()
        placeholders = ', '.join(['?'] * len(values))
        query = f"INSERT INTO {table.value} VALUES ({placeholders})"
        try:
            self.cursor.execute(query, values)
            self.conn.commit()
        except sqlite3.IntegrityError as e:
            self.conn.rollback()
            log.debug(f"Error: {e}")
            return False
        except sqlite3.Error as e:
            self.conn.rollback()
            log.error(f"Error adding row to {table.value}: {e}")
            return False
        return True

    def read_all_rows(self, table: Table, column: str = "*"):
        """
        Read all rows from the specified table.

        ``column`` is inserted into the SQL statement as is, so it must be a
        trusted column list and never user input.

        :param table: Table
        :param column: str, column list to select
        :return: list of row tuples
        """
        self._ensure_open()
        self.cursor.execute(f"SELECT {column} FROM {table.value}")
        rows = self.cursor.fetchall()
        return rows

    def close(self):
        """Close the database connection (a later operation reconnects)"""
        if self.conn is not None:
            self.conn.close()
            self.conn = None
            self.cursor = None

    def get_total_overview(self) -> list:
        """
        Retrieve a total overview of all recently played songs with full details.

        :return: list of (played_at, track_id, title, artist_id, artist_name,
                 album_id, album_name) tuples, newest first; an empty list if
                 the query fails
        """
        try:
            self._ensure_open()
            # Join recently_played with track_information, artist_information, and album_information
            query = f'''
                SELECT rp.played_at,
                       ti.track_id,
                       ti.title,
                       ai.artist_id,
                       ai.artist_name,
                       al.album_id,
                       al.album_name
                FROM {Table.RECENTLY_PLAYED.value} rp
                JOIN {Table.TRACK_INFORMATION.value} ti ON rp.track_id = ti.track_id
                JOIN {Table.ARTIST_INFORMATION.value} ai ON rp.artist_id = ai.artist_id
                JOIN {Table.ALBUM_INFORMATION.value} al ON rp.album_id = al.album_id
                ORDER BY rp.played_at DESC
            '''
            self.cursor.execute(query)
            rows = self.cursor.fetchall()
            return rows
        except sqlite3.Error as e:
            log.error(f"Error retrieving total overview: {e}")
            return []
def _spotify_get(url: str, bearer_token: str, timeout: float = 10.0) -> dict:
    """
    Send an authorized GET request to the Spotify Web API and return the
    decoded JSON body.

    The body is returned whatever the HTTP status is, so an error payload is
    handed to the caller unchanged.

    :param url: str
    :param bearer_token: str
    :param timeout: float, seconds to wait for the server before giving up;
                    without it a stalled connection would block the scraper
                    loop forever
    :return: dict
    """
    header = {
        'Authorization': f'Bearer {bearer_token}'
    }

    response = requests.get(url, headers=header, timeout=timeout)
    return response.json()


def _get_track_information(track_id: str, bearer_token: str) -> dict:
    """
    This function returns the track information based on the track id

    :param track_id: str
    :param bearer_token: str
    :return: dict
    """
    return _spotify_get(f"https://api.spotify.com/v1/tracks/{track_id}", bearer_token)


def _get_artist_information(artist_id: str, bearer_token: str) -> dict:
    """
    This function returns the artist information based on the artist id

    :param artist_id: str
    :param bearer_token: str
    :return: dict
    """
    return _spotify_get(f"https://api.spotify.com/v1/artists/{artist_id}", bearer_token)


def _get_album_information(album_id: str, bearer_token: str) -> dict:
    """
    This function returns the album information based on the album id

    :param album_id: str
    :param bearer_token: str
    :return: dict
    """
    return _spotify_get(f"https://api.spotify.com/v1/albums/{album_id}", bearer_token)
#!/bin/sh
#
# Start up the predictify scraper.
#
# Creates a virtual environment next to this script, installs the pinned
# requirements into it and then runs the scraper loop.

# Abort on the first failing command (e.g. venv creation or pip install)
# instead of starting the scraper in a broken environment.
set -e

# Resolve ./requirements.txt, .venv and src/ relative to this script rather
# than to the caller's working directory.
cd "$(dirname "$0")"

if test -f ./requirements.txt
then
    python3 -m venv .venv
    .venv/bin/pip install -r ./requirements.txt
else
    printf "Missing requirements file! aborting...\n" >&2
    exit 1
fi

# src/scraper.py only defines functions; src/runtime.py is the entry point
# that calls scraping() every 30 minutes. exec replaces the shell so that
# signals are delivered to the Python process directly.
exec .venv/bin/python3 src/runtime.py