mirror of
https://github.com/agresdominik/predictify.git
synced 2026-04-21 17:55:49 +00:00
Merge pull request #11 from agresdominik/feat/scraping_implementation
Feat/scraping implementation
This commit is contained in:
+10
-2
@@ -1,3 +1,12 @@
|
|||||||
|
# databases
|
||||||
|
*.db
|
||||||
|
|
||||||
|
# Custom Tokens file/rotator
|
||||||
|
tokens.json
|
||||||
|
|
||||||
|
# Visual Studio Code
|
||||||
|
.vscode/
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
@@ -129,10 +138,9 @@ celerybeat.pid
|
|||||||
|
|
||||||
# Environments
|
# Environments
|
||||||
.env
|
.env
|
||||||
|
!.env.example
|
||||||
.venv
|
.venv
|
||||||
env/
|
|
||||||
venv/
|
venv/
|
||||||
ENV/
|
|
||||||
env.bak/
|
env.bak/
|
||||||
venv.bak/
|
venv.bak/
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
# .pre-commit-config.yaml
|
||||||
|
|
||||||
|
repos:
|
||||||
|
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: v5.0.0
|
||||||
|
hooks:
|
||||||
|
- id: trailing-whitespace # Remove trailing whitespace
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(py)$
|
||||||
|
|
||||||
|
- id: end-of-file-fixer # Ensure a single newline at the end of a file
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(py)$
|
||||||
|
|
||||||
|
- id: check-yaml # Check if the YAML files are valid
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(yaml|yml)$
|
||||||
|
|
||||||
|
- id: check-json # Check if the JSON files are valid
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(json)$
|
||||||
|
|
||||||
|
- id: check-added-large-files # Prevent large files from being committed
|
||||||
|
args: ['--maxkb=1000']
|
||||||
|
|
||||||
|
- id: check-ast # Check for parse errors in Python files
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(py)$
|
||||||
|
|
||||||
|
- id: debug-statements # Check for debugger imports and breakpoint() calls
|
||||||
|
exclude: '.*test.*'
|
||||||
|
files: \.(py)$
|
||||||
|
|
||||||
|
- repo: https://github.com/PyCQA/isort
|
||||||
|
rev: 5.13.2
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
|
args: ['--profile=black']
|
||||||
|
files: \.(py)$
|
||||||
|
exclude: '.*test.*'
|
||||||
|
|
||||||
|
- repo: https://github.com/PyCQA/flake8
|
||||||
|
rev: 7.1.1
|
||||||
|
hooks:
|
||||||
|
- id: flake8
|
||||||
|
args: ['--extend-ignore=E501,E402,W503,E721','--max-line-length=100']
|
||||||
|
files: \.(py)$
|
||||||
|
exclude: '.*test.*'
|
||||||
@@ -1 +1,27 @@
|
|||||||
# predictify
|
# Predictify
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
A data analysis tool that scrapes your Spotify listening history and lets an ML model predict your next songs.
|
||||||
|
|
||||||
|
## Authentication API
|
||||||
|
|
||||||
|
[Official Documentation](https://developer.spotify.com/documentation/web-api/tutorials/getting-started)
|
||||||
|
[Authorization Code Flow](https://developer.spotify.com/documentation/web-api/tutorials/code-flow)
|
||||||
|
|
||||||
|
## Potentially useful API endpoints
|
||||||
|
|
||||||
|
Recently Played Tracks: /me/player/recently-played [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-recently-played)
|
||||||
|
|
||||||
|
Get Track: /tracks/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-track)
|
||||||
|
|
||||||
|
Get Track's Audio Features - Deprecated: /audio-features/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features)
|
||||||
|
|
||||||
|
Get Track's Audio Analysis - Deprecated: /audio-analysis/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis)
|
||||||
|
|
||||||
|
Get Artist: /artists/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-an-artist)
|
||||||
|
|
||||||
|
## Authors
|
||||||
|
|
||||||
|
[Chris Kiriakou](https://github.com/ckiri)
|
||||||
|
[Dominik Agres](https://github.com/agresdominik)
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
# pytest.ini
|
||||||
|
[pytest]
|
||||||
|
# Set the root directory to the current directory (.)
|
||||||
|
rootdir = .
|
||||||
|
pythonpath = .
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
python-dotenv==1.0.1
|
||||||
|
requests==2.32.3
|
||||||
|
pre-commit==4.1.0
|
||||||
|
pytest==8.3.5
|
||||||
|
coverage==7.7.0
|
||||||
|
pytest-cov==6.0.0
|
||||||
+203
@@ -0,0 +1,203 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||||
|
from urllib.parse import parse_qs, urlencode, urlparse
|
||||||
|
|
||||||
|
import dotenv
|
||||||
|
import requests
|
||||||
|
|
||||||
|
TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'env', 'tokens.json')
|
||||||
|
|
||||||
|
|
||||||
|
def authenticate(scope: str) -> str:
    """
    This function authenticates the user and returns the access token

    Cached tokens are reused while they are still valid. An expired access token is
    renewed with the stored refresh token. If no cached tokens are available, the user
    is asked to authorize the app in the browser and the returned authorization code
    is exchanged for new tokens.

    :param scope: str
    :return: str
    """
    spotify_client_id, spotify_client_secret, spotify_redirect_uri = _read_env_file()

    tokens = _load_tokens()
    if tokens:
        access_token, refresh_token, expires_at = tokens
        if time.time() < expires_at:
            return access_token

        print("Token expired, refreshing...")
        # _refresh_access_token returns (access_token, expires_at) and no new refresh
        # token, so the stored refresh token is kept instead of being overwritten
        # with the expiry timestamp.
        access_token, _ = _refresh_access_token(refresh_token, spotify_client_id, spotify_client_secret)
        _save_tokens(access_token, refresh_token)
        return access_token

    auth_url = _get_authorization_url(spotify_client_id, spotify_redirect_uri, scope)
    print(f'Please go to the following URL to authorize the app: {auth_url}')

    authorization_code = _start_server_and_wait_for_code()

    access_token, refresh_token = _exchange_code_for_token(authorization_code, redirect_uri=spotify_redirect_uri,
                                                           client_id=spotify_client_id, client_secret=spotify_client_secret)

    _save_tokens(access_token, refresh_token)

    return access_token
|
||||||
|
|
||||||
|
|
||||||
|
def _get_authorization_url(client_id: str, redirect_uri: str, scope: str) -> str:
    """
    Build the Spotify authorization URL the user has to open to authorize the app

    The ``state`` query parameter is set to the current Unix time in whole seconds.

    :param client_id: str
    :param redirect_uri: str
    :param scope: str
    :return: str
    """
    query_string = urlencode({
        "response_type": "code",
        "client_id": client_id,
        "scope": scope,
        "redirect_uri": redirect_uri,
        "state": str(int(time.time())),
    })
    return "https://accounts.spotify.com/authorize?" + query_string
|
||||||
|
|
||||||
|
|
||||||
|
def _read_env_file() -> tuple:
    """
    Read the Spotify credentials from the ``env/.env`` file located next to this module

    :return: tuple of (client_id, client_secret, redirect_uri)
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    env_values = dotenv.dotenv_values(dotenv_path=os.path.join(module_dir, 'env', '.env'))
    # A missing key raises KeyError, in the same lookup order as listed here.
    return (
        env_values['SPOTIFY_CLIENT_ID'],
        env_values['SPOTIFY_CLIENT_SECRET'],
        env_values['SPOTIFY_REDIRECT_URI'],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _start_server_and_wait_for_code() -> str:
    """
    This function starts a local HTTP server on localhost:8888 and waits until the
    authorization redirect delivers the authorization code

    Requests that carry neither a ``code`` nor an ``error`` query parameter (for
    example a browser asking for the favicon) are answered with 404 and the server
    keeps waiting. The listening socket is always closed before returning.

    :return: str
    :raises Exception: if the redirect carries an ``error`` parameter instead of a code
    """
    class CallbackHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            query_params = parse_qs(urlparse(self.path).query)
            if 'code' in query_params:
                self.server.authorization_code = query_params['code'][0]
                self.send_response(200)
                self.end_headers()
                self.wfile.write(b"Authorization successful! You can close this window.")
            elif 'error' in query_params:
                self.server.authorization_error = query_params['error'][0]
                self.send_response(400)
                self.end_headers()
                self.wfile.write(b"Authorization failed. You can close this window.")
            else:
                # Unrelated request: always send a response so the client is not left hanging
                self.send_response(404)
                self.end_headers()

    server = HTTPServer(('localhost', 8888), CallbackHandler)
    # Initialise the result attributes so they exist even if no callback sets them
    server.authorization_code = None
    server.authorization_error = None
    print("Starting server to capture the authorization code...")
    try:
        while server.authorization_code is None and server.authorization_error is None:
            server.handle_request()
    finally:
        # Release the port so a later authentication run can bind to it again
        server.server_close()

    if server.authorization_error is not None:
        raise Exception(f"Authorization failed: {server.authorization_error}")
    return server.authorization_code
|
||||||
|
|
||||||
|
|
||||||
|
def _exchange_code_for_token(code: str, redirect_uri: str, client_id: str, client_secret: str) -> tuple:
    """
    This function exchanges the authorization code for an access token

    :param code: str
    :param redirect_uri: str
    :param client_id: str
    :param client_secret: str
    :return: tuple of (access_token, refresh_token); refresh_token is None if the
             response does not contain one
    :raises Exception: if the response does not contain an access token
    """

    token_url = "https://accounts.spotify.com/api/token"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    data = {
        'grant_type': 'authorization_code',
        'code': code,
        'redirect_uri': redirect_uri,
        'client_id': client_id,
        'client_secret': client_secret,
    }

    # Without a timeout, requests waits forever if the server never answers
    response = requests.post(token_url, data=data, headers=headers, timeout=30)
    response_data = response.json()

    if 'access_token' not in response_data:
        raise Exception("Failed to get access token")

    access_token = response_data['access_token']
    refresh_token = response_data.get('refresh_token')
    return access_token, refresh_token
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_access_token(refresh_token: str, client_id: str, client_secret: str) -> tuple:
    """
    Refreshes the access token using the refresh token.

    :param refresh_token: str
    :param client_id: str
    :param client_secret: str
    :return: tuple of (access_token, expires_at), where expires_at is the Unix
             timestamp at which the new access token expires. The second element
             is NOT a refresh token.
    :raises Exception: if the response does not contain an access token
    """
    token_url = "https://accounts.spotify.com/api/token"
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }

    data = {
        'grant_type': 'refresh_token',
        'refresh_token': refresh_token,
        'client_id': client_id,
        'client_secret': client_secret,
    }

    # Without a timeout, requests waits forever if the server never answers
    response = requests.post(token_url, data=data, headers=headers, timeout=30)
    response_data = response.json()

    if 'access_token' not in response_data:
        raise Exception("Failed to refresh access token")

    access_token = response_data['access_token']
    expires_in = response_data['expires_in']
    expires_at = time.time() + expires_in
    return access_token, expires_at
|
||||||
|
|
||||||
|
|
||||||
|
def _load_tokens() -> tuple:
    """
    Loads the tokens from the local file if they exist and are still valid.

    A missing file, an unreadable (corrupt) JSON file, an incomplete token record or
    an expired access token all result in None.

    :return: tuple of (access_token, refresh_token, expires_at) or None
    """
    try:
        # Open directly instead of checking for existence first, so a file removed
        # in between cannot cause a crash
        with open(TOKEN_FILE_PATH, 'r') as f:
            tokens = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # A corrupt cache is treated like a missing one
        return None

    required_keys = ('access_token', 'refresh_token', 'expires_at')
    if all(key in tokens for key in required_keys) and time.time() < tokens['expires_at']:
        return tokens['access_token'], tokens['refresh_token'], tokens['expires_at']
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _save_tokens(access_token: str, refresh_token: str, expires_in: int = 3600) -> None:
    """
    Saves the access and refresh tokens to a local file.

    The expiry timestamp stored next to the tokens is the current time plus
    ``expires_in``.

    :param access_token: str
    :param refresh_token: str
    :param expires_in: int, lifetime of the access token in seconds (default 3600)
    """
    tokens = {
        'access_token': access_token,
        'refresh_token': refresh_token,
        'expires_at': time.time() + expires_in
    }
    with open(TOKEN_FILE_PATH, 'w') as f:
        json.dump(tokens, f)
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
import sqlite3
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class Table(Enum):
    """
    Names of the tables used by the Database class; each value is the SQL table name
    """

    TRACK_INFORMATION = "track_information"
    ARTIST_INFORMATION = "artist_information"
    ALBUM_INFORMATION = "album_information"
    TRACK_ATTRIBUTES = "track_attributes"
    RECENTLY_PLAYED = "recently_played"
|
||||||
|
|
||||||
|
|
||||||
|
class Database:
|
||||||
|
"""
|
||||||
|
A class to handle the database connection and operations
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, db_name):
|
||||||
|
"""Initialize the connection to the database"""
|
||||||
|
self.db_name = db_name
|
||||||
|
self.conn = sqlite3.connect(db_name)
|
||||||
|
self.cursor = self.conn.cursor()
|
||||||
|
self.create_tables()
|
||||||
|
|
||||||
|
def create_tables(self):
|
||||||
|
"""Create the tables in the database"""
|
||||||
|
|
||||||
|
self.cursor.execute(f'''
|
||||||
|
CREATE TABLE IF NOT EXISTS {Table.TRACK_INFORMATION.value} (
|
||||||
|
track_id TEXT PRIMARY KEY,
|
||||||
|
title TEXT
|
||||||
|
);
|
||||||
|
''')
|
||||||
|
|
||||||
|
self.cursor.execute(f'''
|
||||||
|
CREATE TABLE IF NOT EXISTS {Table.ARTIST_INFORMATION.value} (
|
||||||
|
artist_id TEXT PRIMARY KEY,
|
||||||
|
artist_name TEXT
|
||||||
|
);
|
||||||
|
''')
|
||||||
|
|
||||||
|
self.cursor.execute(f'''
|
||||||
|
CREATE TABLE IF NOT EXISTS {Table.ALBUM_INFORMATION.value} (
|
||||||
|
album_id TEXT PRIMARY KEY,
|
||||||
|
album_name TEXT
|
||||||
|
);
|
||||||
|
''')
|
||||||
|
|
||||||
|
self.cursor.execute(f'''
|
||||||
|
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
|
||||||
|
track_id TEXT PRIMARY KEY,
|
||||||
|
attribute_name TEXT,
|
||||||
|
attribute_value TEXT
|
||||||
|
);
|
||||||
|
''')
|
||||||
|
|
||||||
|
self.cursor.execute(f'''
|
||||||
|
CREATE TABLE IF NOT EXISTS {Table.RECENTLY_PLAYED.value} (
|
||||||
|
played_at TIMESTAMP PRIMARY KEY,
|
||||||
|
track_id TEXT,
|
||||||
|
artist_id TEXT,
|
||||||
|
album_id TEXT,
|
||||||
|
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
|
||||||
|
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
|
||||||
|
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
|
||||||
|
);
|
||||||
|
''')
|
||||||
|
|
||||||
|
# Commit the changes
|
||||||
|
self.conn.commit()
|
||||||
|
|
||||||
|
def add_row(self, table: Table, values):
|
||||||
|
"""Add a new row into the specified table"""
|
||||||
|
placeholders = ', '.join(['?'] * len(values))
|
||||||
|
query = f"INSERT INTO {table.value} VALUES ({placeholders})"
|
||||||
|
self.cursor.execute(query, values)
|
||||||
|
self.conn.commit()
|
||||||
|
|
||||||
|
def read_all_rows(self, table: Table, column: str = "*"):
|
||||||
|
"""Read all rows from the specified table"""
|
||||||
|
self.cursor.execute(f"SELECT {column} FROM {table.value}")
|
||||||
|
rows = self.cursor.fetchall()
|
||||||
|
return rows
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
"""Close the database connection"""
|
||||||
|
self.conn.close()
|
||||||
Vendored
+3
@@ -0,0 +1,3 @@
|
|||||||
|
SPOTIFY_CLIENT_ID=your_token_here
|
||||||
|
SPOTIFY_CLIENT_SECRET=your_token_here
|
||||||
|
SPOTIFY_REDIRECT_URI=http://localhost:8888/callback
|
||||||
@@ -0,0 +1,84 @@
|
|||||||
|
import requests
|
||||||
|
|
||||||
|
from auth import authenticate
|
||||||
|
from database_handler import Database, Table
|
||||||
|
|
||||||
|
db = Database('spotify_scraped.db')
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    This function is the main function that will be executed when the script is run

    It authenticates, stores the recently played tracks in the database and then
    prints the track information of every track id found in the recently played
    table. The database connection is closed afterwards, also when an error occurs.
    """
    try:
        scope = "user-read-recently-played"
        bearer_token = authenticate(scope)

        # Once each 30 mins
        _read_recently_played_page_and_add_to_db(bearer_token=bearer_token)

        # Once a day
        all_track_ids = db.read_all_rows(Table.RECENTLY_PLAYED, 'track_id')
        # Every row is a 1-tuple; unpack it so the plain id string (not the tuple)
        # ends up in the request URL.
        for (track_id,) in all_track_ids:
            response = _get_track_information(track_id=track_id, bearer_token=bearer_token)
            print(response)
    finally:
        # Close the database connection
        db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _read_recently_played_page_and_add_to_db(bearer_token: str):
    """
    This function fetches the recently played tracks and adds every play that is not
    yet stored to the recently played table

    Only the first artist of a track is stored.

    :param bearer_token: str
    """
    global db

    last_played_track = _get_last_played_track(bearer_token=bearer_token)

    # played_at is the primary key of the table and consecutive runs return
    # overlapping plays, so plays that are already stored have to be skipped.
    known_plays = {row[0] for row in db.read_all_rows(Table.RECENTLY_PLAYED, 'played_at')}

    for track in last_played_track['items']:
        played_at = track['played_at']
        if played_at in known_plays:
            continue
        track_id = track['track']['id']
        album_id = track['track']['album']['id']
        artist_id = track['track']['artists'][0]['id']
        db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id))
        known_plays.add(played_at)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_last_played_track(url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50", bearer_token: str = "") -> dict:
    """
    This function returns the recently played tracks; the page size is set by the
    ``limit`` query parameter of the url

    :param url: str
    :param bearer_token: str
    :return: dict
    """

    header = {
        'Authorization': f'Bearer {bearer_token}'
    }

    # Without a timeout, requests waits forever if the server never answers
    response = requests.get(url, headers=header, timeout=30)
    response_json = response.json()
    return response_json
|
||||||
|
|
||||||
|
|
||||||
|
def _get_track_information(track_id: str, bearer_token: str) -> dict:
    """
    This function returns the track information based on the track id

    :param track_id: str
    :param bearer_token: str
    :return: dict
    """

    url = f"https://api.spotify.com/v1/tracks/{track_id}"
    header = {
        'Authorization': f'Bearer {bearer_token}'
    }

    # Without a timeout, requests waits forever if the server never answers
    response = requests.get(url, headers=header, timeout=30)
    response_json = response.json()
    return response_json
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user