mirror of
https://github.com/agresdominik/predictify.git
synced 2026-04-21 17:55:49 +00:00
Merge pull request #30 from agresdominik/feat/audio_analysis
Feat/audio analysis
This commit is contained in:
@@ -0,0 +1,11 @@
|
||||
logs/
|
||||
data/
|
||||
src/__pycache__/
|
||||
.git
|
||||
*.md
|
||||
.venv
|
||||
LICENSE
|
||||
MAKEFILE
|
||||
pytest.ini
|
||||
test/
|
||||
|
||||
+25
-5
@@ -1,11 +1,31 @@
|
||||
# Machine Learning grid search
|
||||
my_dir/
|
||||
|
||||
# Audio previews
|
||||
audio_previews/
|
||||
|
||||
# Audio data files
|
||||
audio_features*
|
||||
audio_data/
|
||||
|
||||
# My testing file
|
||||
main_test.py
|
||||
|
||||
# .db
|
||||
*.db
|
||||
|
||||
# DS_Store
|
||||
.DS_Store
|
||||
|
||||
# Gdpr Data file
|
||||
Streaming_History*
|
||||
|
||||
# Test running file
|
||||
main_test.py
|
||||
|
||||
# databases
|
||||
*.db
|
||||
|
||||
# Custom Tokens file/rotator
|
||||
tokens.json
|
||||
# data dir
|
||||
data/*
|
||||
data-docker/
|
||||
|
||||
# Visual Studio Code
|
||||
.vscode/
|
||||
|
||||
@@ -22,7 +22,7 @@ repos:
|
||||
files: \.(json)$
|
||||
|
||||
- id: check-added-large-files # Prevent large files from being committed
|
||||
args: ['--maxkb=1000']
|
||||
args: ['--maxkb=2000']
|
||||
|
||||
- id: check-ast # Check for parse errors in Python files
|
||||
exclude: '.*test.*'
|
||||
|
||||
-24
@@ -1,24 +0,0 @@
|
||||
FROM alpine:latest
|
||||
|
||||
WORKDIR /root
|
||||
|
||||
RUN apk update && \
|
||||
apk add --no-cache \
|
||||
openssh \
|
||||
python3 \
|
||||
py3-pip \
|
||||
sqlite
|
||||
|
||||
EXPOSE 22
|
||||
|
||||
RUN mkdir /root/src
|
||||
|
||||
COPY ./startup.sh /root
|
||||
COPY ./requirements.txt /root
|
||||
COPY ./src/ /root/src/
|
||||
|
||||
RUN ls -la
|
||||
|
||||
VOLUME /root
|
||||
|
||||
ENTRYPOINT ["/bin/sh", "/root/startup.sh"]
|
||||
@@ -0,0 +1,19 @@
|
||||
.PHONY: all dockerfile clean

TAG="unstable"
PROJ_NAME="predictify"

all: install dockerfile

# Create the local data directory used for persistent storage.
install:
	mkdir -p ./data

# Build the container image from the docker/ Dockerfile.
dockerfile: ./docker/Dockerfile
	docker build \
		--tag "$(PROJ_NAME):$(TAG)" \
		--build-arg PROJ_NAME=$(PROJ_NAME) \
		--file ./docker/Dockerfile \
		.

# The prerequisite must name the file that is actually removed (it lives in
# ./data); the old prerequisite ./spotify_scraped.db never exists, so
# `make clean` aborted with "No rule to make target".
clean: ./data/spotify_scraped.db
	rm -r ./data/spotify_scraped.db
|
||||
@@ -11,15 +11,45 @@ A Data analysis tool to scrape your Spotify History usage and let a ML-Model pre
|
||||
|
||||
## Usable possible APIs
|
||||
|
||||
Recently Played Tracks: /me/player/recently-played [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-recently-played)
|
||||
Recently Played Tracks: `/me/player/recently-played` [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-recently-played)
|
||||
|
||||
Get Track: /tracks/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-track)
|
||||
Get Track: `/tracks/{id}` [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-track)
|
||||
|
||||
Get Track's Audio Features - Deprecated: /audio-features/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features)
|
||||
Get Track's Audio Features _(Deprecated)_: `/audio-features/{id}` [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-features)
|
||||
|
||||
Get Track's Audio Analysis - Deprecated: /audio-analysis/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis)
|
||||
Get Track's Audio Analysis _(Deprecated)_: `/audio-analysis/{id}` [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-audio-analysis)
|
||||
|
||||
Get Artist: /artists/{id} [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-an-artist)
|
||||
Get Artist: `/artists/{id}` [Official Spotify Documentation](https://developer.spotify.com/documentation/web-api/reference/get-an-artist)
|
||||
|
||||
## Docker usage
|
||||
|
||||
`cd` inside the projects directory:
|
||||
```sh
|
||||
cd predictify
|
||||
```
|
||||
To run predictify inside a container, first make sure to build the image:
|
||||
```sh
|
||||
make dockerfile
|
||||
```
|
||||
Create a separate data directory (e.g. `data-docker`):
|
||||
```sh
|
||||
mkdir data-docker
|
||||
```
|
||||
> [!NOTE]
|
||||
> To detach the container to run it in the background add the `--detach` directly after the `run` command.
|
||||
Then run the following docker command, to run the container in the foreground:
|
||||
```sh
|
||||
docker run \
|
||||
--name predictify \
|
||||
--network=host \
|
||||
--volume $(pwd)/data-docker:/app/predictify/data \
|
||||
--volume $(pwd)/config:/app/predictify/config \
|
||||
predictify:unstable
|
||||
```
|
||||
|
||||
## GDPR Data
|
||||
|
||||
If you have gdpr data, create a folder: ```data/gdpr_data``` and add all .json files containing your play history into it. In order to extract it, run the script: ```python3 src/runtime.py --export```
|
||||
|
||||
## Authors
|
||||
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
FROM alpine:3.21.3

# Set environment variables
ARG PROJ_NAME
ENV PROJ_NAME=${PROJ_NAME}

RUN mkdir -p /app/${PROJ_NAME}

# The following steps are executed from the specified directory below
WORKDIR /app/${PROJ_NAME}

# Install all necessary software
RUN apk add --no-cache python3 sqlite

# Create the directories, needed for persistent storage (e.g. database, tokens)
RUN mkdir ./data ./src ./config

# Create mount points for logs, data, src and config
# NOTE(review): Docker expects VOLUME paths to be absolute; the relative
# ./data ./src ./config entries may not resolve against WORKDIR as intended --
# confirm against the Dockerfile reference.
VOLUME /var/log ./data ./src ./config

# Copy the application source code
COPY ./src/ ./src/

# Create a separate venv inside the container & install requirements.
# Use `python3` (the only interpreter name the alpine python3 package
# guarantees) and call the venv's pip directly: `source` and `deactivate`
# are bash-isms that the default /bin/sh used by RUN does not provide, and
# invoking ./.venv/bin/pip makes activation unnecessary anyway.
COPY ./requirements.txt ./requirements.txt
RUN \
    python3 -m venv .venv && \
    ./.venv/bin/pip install -r ./requirements.txt

COPY ./docker/startup.sh ./startup.sh

# When starting the container the following is executed
ENTRYPOINT ["./startup.sh"]
|
||||
Executable
+5
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
#
# Startup predictify. Don't use this. This is for docker specifically.
#
# Use the POSIX `.` command instead of `source`: the script runs under
# /bin/sh, and `source` is a bash-ism that plain POSIX shells (e.g. dash)
# do not define. Activation is kept for any env vars it exports, although
# the interpreter is invoked by its venv path explicitly below.
. .venv/bin/activate
.venv/bin/python src/runtime.py --export
|
||||
@@ -4,3 +4,14 @@ pre-commit==4.1.0
|
||||
pytest==8.3.5
|
||||
coverage==7.7.0
|
||||
pytest-cov==6.0.0
|
||||
pandas==2.2.3
|
||||
numpy==1.26.4
|
||||
scikit-learn==1.6.1
|
||||
tensorflow==2.19.0
|
||||
keras==3.9.2
|
||||
keras-tuner==1.4.7
|
||||
scikeras==0.13.0
|
||||
matplotlib==3.10.1
|
||||
seaborn==0.13.2
|
||||
librosa==0.11.0
|
||||
optuna==4.2.1
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,28 @@
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
def get_spotify_preview_url(spotify_track_id: str) -> Optional[str]:
    """
    Get the preview URL for a Spotify track using the embed page workaround.

    The public embed page (https://open.spotify.com/embed/track/<id>) embeds
    a JSON blob containing an ``"audioPreview": {"url": ...}`` entry; this
    function scrapes that URL out of the returned HTML with a regex.

    Args:
        spotify_track_id (str): The Spotify track ID

    Returns:
        Optional[str]: The preview URL if found, else None
    """
    try:
        embed_url = f"https://open.spotify.com/embed/track/{spotify_track_id}"
        # Always pass a timeout: requests has no default, so a stalled
        # connection would otherwise hang the caller indefinitely.
        response = requests.get(embed_url, timeout=10)
        response.raise_for_status()

        html = response.text
        match = re.search(r'"audioPreview":\s*{\s*"url":\s*"([^"]+)"', html)
        return match.group(1) if match else None

    # Catch only network/HTTP failures instead of a bare Exception, so
    # programming errors are not silently swallowed.
    except requests.RequestException as e:
        print(f"Failed to fetch Spotify preview URL: {e}")
        return None
|
||||
@@ -0,0 +1,391 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 7814.41track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 8865.11track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 8410.16track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 10286.20track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6751.92track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 7016.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 9608.71track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 569.98track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 8934.23track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 3487.43track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 8381.08track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 3057.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 6150.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 6555.71track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 2342.34track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 9073.67track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 6341.27track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 4801.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 4224.31track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 7571.09track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 91/91 [00:00<00:00, 6534.41track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 7016.58track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 7011.93track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 7224.25track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 5970.09track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 1830.87track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 7771.45track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 3839.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 8010.83track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 7/7 [00:00<00:00, 1725.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 80/80 [00:00<00:00, 3127.45track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 5919.12track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 2211.42track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 5711.20track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 5389.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 5007.79track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 5448.83track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 1677.91track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 5254.51track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 5087.50track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 6186.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 1513.61track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 6105.52track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 4209.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 1611.84track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 127.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 200.62track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 5717.10track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 3484.29track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 177.04track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 5664.96track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 239.08track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 223.04track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 5842.92track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 7040.71track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 7355.77track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 292.89track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 8041.64track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 420.54track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 6490.87track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 5549.89track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 5031.36track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 1444.37track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 5870.31track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 4974.82track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 4823.21track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6310.05track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 196/196 [00:00<00:00, 312.44track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 5850.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 4904.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 5343.90track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 4764.65track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 4891.16track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 280.38track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 4945.14track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 4609.60track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 1155.63track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 3454.36track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 4191.60track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 4414.67track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 4393.90track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 2788.99track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 6180.40track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 260.50track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 4974.38track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 204.43track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 189/189 [00:00<00:00, 433.69track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 4620.28track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 5229.06track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:00<00:00, 6571.83track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 252.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 7138.69track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 4936.31track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 5408.81track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 6418.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 6733.21track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 6277.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 168.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 5975.06track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 7002.79track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 6256.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 6033.96track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 283.78track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 6277.83track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 5573.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:00<00:00, 6510.58track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6384.23track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6124.12track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6541.53track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:00<00:00, 857.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:00<00:00, 375.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 10254.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 6399.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 6457.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:00<00:00, 237.51track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 6714.17track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 287.82track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 6351.42track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:00<00:00, 7704.99track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:00<00:00, 449.76track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 6541.76track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:00<00:00, 7323.53track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:00<00:00, 465.08track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:15<00:00, 6.16track/s] \n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:26<00:00, 3.60track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:25<00:00, 3.85track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:57<00:00, 3.34track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:27<00:00, 3.49track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 194/194 [00:53<00:00, 3.63track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:27<00:00, 3.58track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 187/187 [00:55<00:00, 3.35track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:29<00:00, 3.19track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 196/196 [00:57<00:00, 3.41track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:25<00:00, 3.63track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 197/197 [00:52<00:00, 3.75track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:51<00:00, 3.71track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:26<00:00, 3.69track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 194/194 [00:55<00:00, 3.50track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:28<00:00, 3.46track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:25<00:00, 3.69track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:25<00:00, 3.65track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:55<00:00, 3.46track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:26<00:00, 3.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:50<00:00, 3.74track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:25<00:00, 3.86track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:52<00:00, 3.63track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:28<00:00, 3.40track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 195/195 [00:55<00:00, 3.54track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:27<00:00, 3.44track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:27<00:00, 3.57track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:34<00:00, 2.81track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:27<00:00, 3.55track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.38track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:52<00:00, 3.64track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:32<00:00, 3.01track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:28<00:00, 3.36track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:24<00:00, 3.92track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.40track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 188/188 [00:49<00:00, 3.79track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:26<00:00, 3.53track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:55<00:00, 3.45track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.30track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:29<00:00, 3.23track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 90/90 [00:22<00:00, 3.93track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:25<00:00, 3.63track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:26<00:00, 3.60track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:26<00:00, 3.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 90/90 [00:24<00:00, 3.66track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:28<00:00, 3.38track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:27<00:00, 3.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:25<00:00, 3.74track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:25<00:00, 3.80track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:25<00:00, 3.69track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:27<00:00, 3.62track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:25<00:00, 3.71track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:26<00:00, 3.55track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:50<00:00, 3.83track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 197/197 [00:53<00:00, 3.67track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 185/185 [00:46<00:00, 4.01track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 195/195 [00:48<00:00, 4.03track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:51<00:00, 3.68track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:26<00:00, 3.64track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 197/197 [00:52<00:00, 3.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:24<00:00, 3.87track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 195/195 [01:04<00:00, 3.01track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:54<00:00, 3.57track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:28<00:00, 3.35track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 194/194 [00:55<00:00, 3.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 192/192 [00:59<00:00, 3.23track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.36track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:51<00:00, 3.67track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 189/189 [01:02<00:00, 3.01track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:28<00:00, 3.51track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 188/188 [00:55<00:00, 3.40track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:29<00:00, 3.19track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:38<00:00, 2.45track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 194/194 [00:55<00:00, 3.50track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:30<00:00, 3.13track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:27<00:00, 3.35track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 186/186 [00:56<00:00, 3.31track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:54<00:00, 3.52track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:40<00:00, 2.39track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:56<00:00, 1.64track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:54<00:00, 3.57track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 195/195 [01:04<00:00, 3.03track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:32<00:00, 2.93track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 192/192 [01:05<00:00, 2.92track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:31<00:00, 3.12track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:34<00:00, 2.82track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [00:55<00:00, 3.40track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:26<00:00, 3.49track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:53<00:00, 3.58track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:31<00:00, 3.03track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:28<00:00, 3.42track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 184/184 [00:50<00:00, 3.61track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 99/99 [00:31<00:00, 3.15track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:27<00:00, 3.42track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.33track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 189/189 [00:52<00:00, 3.60track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:27<00:00, 3.54track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:25<00:00, 3.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 91/91 [00:26<00:00, 3.47track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:27<00:00, 3.50track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 88/88 [00:23<00:00, 3.78track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:29<00:00, 3.35track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 186/186 [00:53<00:00, 3.46track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:29<00:00, 3.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:30<00:00, 3.13track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:32<00:00, 2.91track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 186/186 [00:56<00:00, 3.27track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:27<00:00, 3.34track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:24<00:00, 3.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:26<00:00, 3.56track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 186/186 [00:53<00:00, 3.46track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:29<00:00, 3.18track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:27<00:00, 3.43track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 190/190 [01:01<00:00, 3.08track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 93/93 [00:28<00:00, 3.29track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 197/197 [00:59<00:00, 3.31track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 192/192 [00:59<00:00, 3.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:37<00:00, 2.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 192/192 [00:55<00:00, 3.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:26<00:00, 3.62track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:27<00:00, 3.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 188/188 [00:54<00:00, 3.44track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:28<00:00, 3.39track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 92/92 [00:28<00:00, 3.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:29<00:00, 3.30track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:27<00:00, 3.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 97/97 [00:29<00:00, 3.34track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:26<00:00, 3.66track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:55<00:00, 3.49track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:53<00:00, 3.62track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 94/94 [00:27<00:00, 3.41track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 188/188 [00:51<00:00, 3.62track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:56<00:00, 3.41track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 193/193 [00:54<00:00, 3.52track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:57<00:00, 3.30track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 196/196 [00:57<00:00, 3.43track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 95/95 [00:25<00:00, 3.67track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 98/98 [00:34<00:00, 2.82track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 188/188 [00:56<00:00, 3.35track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:28<00:00, 3.34track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 96/96 [00:29<00:00, 3.22track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 191/191 [00:58<00:00, 3.29track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 82/82 [00:25<00:00, 3.27track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 5/5 [00:00<00:00, 649.53track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 16/16 [00:00<00:00, 2081.48track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 4/4 [00:00<00:00, 1143.17track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 16/16 [00:00<00:00, 2154.59track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 49/49 [00:10<00:00, 4.51track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 36/36 [00:10<00:00, 3.49track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 19/19 [00:06<00:00, 2.76track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 24/24 [00:07<00:00, 3.36track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 20/20 [00:06<00:00, 2.99track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 33/33 [00:09<00:00, 3.33track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 30/30 [00:08<00:00, 3.72track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 10/10 [00:02<00:00, 3.87track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 2/2 [00:00<00:00, 439.26track/s]\n",
|
||||
"Downloading previews: 100%|██████████| 1/1 [00:00<00:00, 5.52track/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 29\u001b[39m\n\u001b[32m 26\u001b[39m df_new = df[~df[\u001b[33m'\u001b[39m\u001b[33mtrack_id\u001b[39m\u001b[33m'\u001b[39m].isin(processed)].copy()\n\u001b[32m 27\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m df_new.empty:\n\u001b[32m 28\u001b[39m \u001b[38;5;66;03m# nothing new → wait and retry\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m29\u001b[39m time.sleep(SLEEP_INTERVAL)\n\u001b[32m 30\u001b[39m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[32m 32\u001b[39m \u001b[38;5;66;03m# 3) Download each new preview with a progress bar\u001b[39;00m\n",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import time\n",
|
||||
"import requests\n",
|
||||
"import pandas as pd\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"CSV_PATH = './track_genre_balanced_url.csv'\n",
|
||||
"DOWNLOAD_DIR = 'audio_previews'\n",
|
||||
"SLEEP_INTERVAL = 60 # seconds to wait between checks\n",
|
||||
"\n",
|
||||
"os.makedirs(DOWNLOAD_DIR, exist_ok=True)\n",
|
||||
"\n",
|
||||
"# Keep track of which track_ids we've already attempted\n",
|
||||
"processed = set()\n",
|
||||
"\n",
|
||||
"while True:\n",
|
||||
" # 1) Load current CSV\n",
|
||||
" try:\n",
|
||||
" df = pd.read_csv(CSV_PATH)\n",
|
||||
" except FileNotFoundError:\n",
|
||||
" print(f\"{CSV_PATH} not found, waiting...\")\n",
|
||||
" time.sleep(SLEEP_INTERVAL)\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # 2) Identify new tracks we haven't processed yet\n",
|
||||
" df_new = df[~df['track_id'].isin(processed)].copy()\n",
|
||||
" if df_new.empty:\n",
|
||||
" # nothing new → wait and retry\n",
|
||||
" time.sleep(SLEEP_INTERVAL)\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # 3) Download each new preview with a progress bar\n",
|
||||
" for _, row in tqdm(df_new.iterrows(),\n",
|
||||
" total=len(df_new),\n",
|
||||
" desc=\"Downloading previews\",\n",
|
||||
" unit=\"track\"):\n",
|
||||
" track_id = row['track_id']\n",
|
||||
" preview_url = row['preview']\n",
|
||||
" out_path = os.path.join(DOWNLOAD_DIR, f\"{track_id}.mp3\")\n",
|
||||
"\n",
|
||||
" # mark as processed so we don't retry on crashes\n",
|
||||
" processed.add(track_id)\n",
|
||||
"\n",
|
||||
" # skip if file already exists\n",
|
||||
" if os.path.exists(out_path):\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # attempt download\n",
|
||||
" try:\n",
|
||||
" resp = requests.get(preview_url, timeout=30)\n",
|
||||
" if resp.status_code == 200:\n",
|
||||
" with open(out_path, 'wb') as f:\n",
|
||||
" f.write(resp.content)\n",
|
||||
" else:\n",
|
||||
" print(f\"HTTP {resp.status_code} for {track_id}\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error downloading {track_id}: {e}\")\n",
|
||||
"\n",
|
||||
" # 4) Pause before next check\n",
|
||||
" time.sleep(SLEEP_INTERVAL)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,297 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Read out Data from Kaggle Dataset, get preview URL-s and save to file\n",
|
||||
"\n",
|
||||
"## this should be run only once"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 6%|▋ | 76/1183 [00:35<33:39, 1.82s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/64ffsubBonytxZc5fQJhdO\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 9%|▊ | 102/1183 [00:55<34:18, 1.90s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2Iu5wxKFiEEQDQK1Pldsis\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 9%|▉ | 111/1183 [01:03<33:10, 1.86s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/6syvS9gZzjB8b9DdKVhAJH\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 15%|█▌ | 180/1183 [01:54<53:30, 3.20s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2qrVR11O44iJ0DVTNCExjA\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 19%|█▉ | 225/1183 [02:25<29:37, 1.86s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3njPW0vttbjt5j1Elt6sJI\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 32%|███▏ | 381/1183 [03:26<23:39, 1.77s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3T7zNYia3nk9d8uXhO9Xud\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 53%|█████▎ | 630/1183 [05:23<16:28, 1.79s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/41Sfs0E8hr8w2BvzUtof4O\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 54%|█████▎ | 633/1183 [05:29<20:57, 2.29s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/3H9aA6IO5gfHW72m8YU8Iv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 57%|█████▋ | 675/1183 [05:56<15:49, 1.87s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/0lvHnw9Exl8jLV3zuRsksJ\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 67%|██████▋ | 792/1183 [07:06<12:08, 1.86s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/17sSDGIRIkB0jOKb2cBURf\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 77%|███████▋ | 911/1183 [08:03<09:15, 2.04s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/5RcZ5jbBgKDdM6BuoSeh8P\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 77%|███████▋ | 912/1183 [08:08<13:32, 3.00s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/0YQrHOpi219lZA8SDly4iG\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 90%|█████████ | 1069/1183 [09:31<03:31, 1.85s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Failed to fetch Spotify preview URL: 504 Server Error: Gateway Timeout for url: https://open.spotify.com/embed/track/2iql0ydkQX1hZ375EyRFFF\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fetching previews: 100%|██████████| 1183/1183 [10:19<00:00, 1.91it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from spotify_preview import get_spotify_preview_url\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n",
|
||||
"# --- 0) Load & dedupe your balanced track/genre file ---\n",
|
||||
"df = pd.read_csv('track_genres_balanced.csv')\n",
|
||||
"df = df.drop_duplicates(subset=['track_id'])\n",
|
||||
"df = df.dropna(subset=['genre'])\n",
|
||||
"\n",
|
||||
"# --- 1) Prep output CSV (header only once) ---\n",
|
||||
"#output_csv = 'track_genre_balanced_url.csv'\n",
|
||||
"#pd.DataFrame(columns=['track_id','genre','preview']).to_csv(output_csv, index=False)\n",
|
||||
"#output_csv = pd.read_csv('track_genre_balanced_url.csv')\n",
|
||||
"\n",
|
||||
"output_csv = 'track_genre_balanced_url.csv'\n",
|
||||
"\n",
|
||||
"if os.path.exists(output_csv):\n",
|
||||
" # load already-fetched track_ids and drop them from df\n",
|
||||
" done = pd.read_csv(output_csv, usecols=['track_id'])\n",
|
||||
" processed_ids = set(done['track_id'].astype(str))\n",
|
||||
" df = df[~df['track_id'].astype(str).isin(processed_ids)]\n",
|
||||
" write_header = False\n",
|
||||
"else:\n",
|
||||
" # new file → write header\n",
|
||||
" pd.DataFrame(columns=['track_id','genre','preview']) \\\n",
|
||||
" .to_csv(output_csv, index=False)\n",
|
||||
" write_header = False # header is already there\n",
|
||||
"\n",
|
||||
"# --- 2) Parameters ---\n",
|
||||
"BATCH_SIZE = 100 # how many tracks to process per “mini‐batch”\n",
|
||||
"PAUSE = 0.1 # if you want a small sleep between API calls\n",
|
||||
"\n",
|
||||
"# --- 3) Loop with a single progress bar over all tracks ---\n",
|
||||
"with tqdm(total=len(df), desc=\"Fetching previews\") as pbar:\n",
|
||||
" for start in range(0, len(df), BATCH_SIZE):\n",
|
||||
" chunk = df.iloc[start:start + BATCH_SIZE]\n",
|
||||
" rows = []\n",
|
||||
"\n",
|
||||
" # 4) Per‐track lookup\n",
|
||||
" for _, row in chunk.iterrows():\n",
|
||||
" track_id = row['track_id']\n",
|
||||
" genre = row['genre']\n",
|
||||
"\n",
|
||||
" preview = get_spotify_preview_url(track_id)\n",
|
||||
" if preview:\n",
|
||||
" rows.append({\n",
|
||||
" 'track_id': track_id,\n",
|
||||
" 'genre': genre,\n",
|
||||
" 'preview': preview\n",
|
||||
" })\n",
|
||||
" # else: silently skip or print an error if you prefer\n",
|
||||
"\n",
|
||||
" pbar.update(1)\n",
|
||||
" if PAUSE:\n",
|
||||
" import time; time.sleep(PAUSE)\n",
|
||||
"\n",
|
||||
" # 5) Append this batch’s hits to disk\n",
|
||||
" if rows:\n",
|
||||
" pd.DataFrame(rows).to_csv(\n",
|
||||
" output_csv,\n",
|
||||
" mode='a',\n",
|
||||
" header=False,\n",
|
||||
" index=False\n",
|
||||
" )\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
+25
-4
@@ -1,6 +1,5 @@
|
||||
import base64
|
||||
import json
|
||||
import logging as log
|
||||
import os
|
||||
import time
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
@@ -9,7 +8,11 @@ from urllib.parse import parse_qs, urlencode, urlparse
|
||||
import dotenv
|
||||
import requests
|
||||
|
||||
TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'env', 'tokens.json')
|
||||
from logger import LoggerWrapper
|
||||
|
||||
TOKEN_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'tokens.json')
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def simple_authenticate(grant_type: str = "client_credentials") -> str:
|
||||
@@ -32,13 +35,17 @@ def simple_authenticate(grant_type: str = "client_credentials") -> str:
|
||||
"grant_type": f"{grant_type}"
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(token_url, headers=headers, data=data)
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error authenticating: {e}")
|
||||
return None
|
||||
|
||||
if response.status_code == 200:
|
||||
access_token = response.json().get('access_token')
|
||||
return access_token
|
||||
else:
|
||||
log.error(f"Error {response.status_code}: {response.text}")
|
||||
log.error(f"Error authenticating {response.status_code}: {response.text}")
|
||||
|
||||
|
||||
def authenticate(scope: str) -> str:
|
||||
@@ -101,10 +108,14 @@ def _read_env_file() -> tuple:
|
||||
|
||||
:return: tuple
|
||||
"""
|
||||
try:
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
dotenv_folder_path = os.path.join(current_dir, 'env')
|
||||
dotenv_folder_path = os.path.join(current_dir, '../config')
|
||||
dotenv_path = os.path.join(dotenv_folder_path, '.env')
|
||||
contents = dotenv.dotenv_values(dotenv_path=dotenv_path)
|
||||
except Exception as e:
|
||||
log.error(f"Error reading the .env file: {e}")
|
||||
return None
|
||||
spotify_client_id = contents['SPOTIFY_CLIENT_ID']
|
||||
spotify_client_secret = contents['SPOTIFY_CLIENT_SECRET']
|
||||
spotify_redirect_uri = contents['SPOTIFY_REDIRECT_URI']
|
||||
@@ -158,7 +169,12 @@ def _exchange_code_for_token(code: str, redirect_uri: str, client_id: str, clien
|
||||
'client_secret': client_secret,
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(token_url, data=data, headers=headers)
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error exchanging code for token: {e}")
|
||||
return None
|
||||
|
||||
response_data = response.json()
|
||||
|
||||
if 'access_token' not in response_data:
|
||||
@@ -192,7 +208,12 @@ def _refresh_access_token(refresh_token: str, client_id: str, client_secret: str
|
||||
'client_secret': client_secret,
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(token_url, data=data, headers=headers)
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error refreshing access token: {e}")
|
||||
return None
|
||||
|
||||
response_data = response.json()
|
||||
|
||||
if 'access_token' not in response_data:
|
||||
|
||||
+31
-8
@@ -1,7 +1,12 @@
|
||||
import logging as log
|
||||
import sqlite3
|
||||
from enum import Enum
|
||||
|
||||
from logger import LoggerWrapper
|
||||
|
||||
# DATABASE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'spotify_scraped.db')
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
class Table(Enum):
|
||||
TRACK_INFORMATION = "track_information"
|
||||
@@ -16,7 +21,7 @@ class Database:
|
||||
A class to handle the database connection and operations
|
||||
"""
|
||||
|
||||
def __init__(self, db_name):
|
||||
def __init__(self, db_name: str):
|
||||
"""Initialize the connection to the database"""
|
||||
self.db_name = db_name
|
||||
self.conn = sqlite3.connect(db_name)
|
||||
@@ -60,8 +65,18 @@ class Database:
|
||||
self.cursor.execute(f'''
|
||||
CREATE TABLE IF NOT EXISTS {Table.TRACK_ATTRIBUTES.value} (
|
||||
track_id TEXT PRIMARY KEY,
|
||||
attribute_name TEXT,
|
||||
attribute_value TEXT
|
||||
acousticness FLOAT,
|
||||
danceability FLOAT,
|
||||
duration_ms INTEGER,
|
||||
energy FLOAT,
|
||||
instrumentalness FLOAT,
|
||||
key INTEGER,
|
||||
liveness FLOAT,
|
||||
loudness FLOAT,
|
||||
speechiness FLOAT,
|
||||
tempo FLOAT,
|
||||
time_signature INTEGER,
|
||||
valence FLOAT
|
||||
);
|
||||
''')
|
||||
|
||||
@@ -73,12 +88,14 @@ class Database:
|
||||
album_id TEXT,
|
||||
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_INFORMATION.value}(track_id),
|
||||
FOREIGN KEY (artist_id) REFERENCES {Table.ARTIST_INFORMATION.value}(artist_id),
|
||||
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id)
|
||||
FOREIGN KEY (album_id) REFERENCES {Table.ALBUM_INFORMATION.value}(album_id),
|
||||
FOREIGN KEY (track_id) REFERENCES {Table.TRACK_ATTRIBUTES.value}(track_id)
|
||||
);
|
||||
''')
|
||||
|
||||
# Commit the changes
|
||||
self.conn.commit()
|
||||
log.debug("Initialised tables")
|
||||
|
||||
def add_row(self, table: Table, values):
|
||||
"""Add a new row into the specified table"""
|
||||
@@ -88,17 +105,22 @@ class Database:
|
||||
self.cursor.execute(query, values)
|
||||
self.conn.commit()
|
||||
except Exception as e:
|
||||
log.debug(f"Error: {e}")
|
||||
log.error(f"Error while inserting row into table {table.value}: {e}")
|
||||
|
||||
def read_all_rows(self, table: Table, column: str = "*"):
|
||||
"""Read all rows from the specified table"""
|
||||
try:
|
||||
self.cursor.execute(f"SELECT {column} FROM {table.value}")
|
||||
rows = self.cursor.fetchall()
|
||||
return rows
|
||||
except Exception as e:
|
||||
log.error(f"Error while reading all rows from table {table.value}: {e}")
|
||||
return []
|
||||
|
||||
def close(self):
|
||||
def close(self, message: str):
|
||||
"""Close the database connection"""
|
||||
self.conn.close()
|
||||
log.info(f"Database connection closed from file: {message}")
|
||||
|
||||
def get_total_overview(self) -> list:
|
||||
"""Retrieve a total overview of all recently played songs with full details"""
|
||||
@@ -122,5 +144,6 @@ class Database:
|
||||
rows = self.cursor.fetchall()
|
||||
return rows
|
||||
except Exception as e:
|
||||
log.error(f"Error retrieving total overview: {e}")
|
||||
log.error(f"Error retrieving total overview: {e}"
|
||||
f"\nQuery Executed: {query}")
|
||||
return []
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,149 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from auth import simple_authenticate
|
||||
from database_handler import Database, Table
|
||||
from logger import LoggerWrapper
|
||||
from spotify_api import get_multiple_field_information
|
||||
|
||||
# Define the absolute folder path to the folder containing the gdrp retrieved data
|
||||
folder_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'gdpr_data')
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def _read_gdrp_data() -> list:
|
||||
"""
|
||||
This function reads all .json files in the folder containing the gdpr data.
|
||||
This data is then extracted into a dict and sorted by timestamp ascending.
|
||||
|
||||
:return: all_songs_played: A dict with an items field containing all songs played for the user
|
||||
"""
|
||||
all_songs_played = []
|
||||
try:
|
||||
for filename in os.listdir(folder_path):
|
||||
|
||||
if filename.endswith('.json'):
|
||||
file_path = os.path.join(folder_path, filename)
|
||||
|
||||
with open(file_path, 'r') as file:
|
||||
data = json.load(file)
|
||||
|
||||
for entry in data:
|
||||
# This removes all podcasts from the list
|
||||
if entry['spotify_track_uri'] is None:
|
||||
continue
|
||||
try:
|
||||
track = {
|
||||
'timestamp': entry['ts'],
|
||||
'id': _extract_id(entry['spotify_track_uri']),
|
||||
'track_name': entry['master_metadata_track_name'],
|
||||
'artist_name': entry['master_metadata_album_artist_name'],
|
||||
'album_name': entry['master_metadata_album_album_name'],
|
||||
'conn_country': entry['conn_country'],
|
||||
'ms_played': entry['ms_played']
|
||||
}
|
||||
all_songs_played.append(track)
|
||||
except Exception as e:
|
||||
log.warning(f'Missing field from gdpr data: {e}')
|
||||
except Exception as e:
|
||||
log.error(f'Failed to read gdpr data: {e}')
|
||||
|
||||
all_songs_played = sorted(all_songs_played, key=lambda x: x['timestamp'])
|
||||
return all_songs_played
|
||||
|
||||
|
||||
def _extract_id(spotify_id: str) -> str:
|
||||
"""
|
||||
This function gets a id with extra details and extracts the id from it.
|
||||
|
||||
:param: id a string containing the id
|
||||
:return: str the ID
|
||||
"""
|
||||
prefix = "spotify:track:"
|
||||
prefix_removed_id = spotify_id[len(prefix):]
|
||||
return prefix_removed_id
|
||||
|
||||
|
||||
def _populate_ids(all_songs_played: list):
|
||||
|
||||
track_ids = []
|
||||
all_songs_played_info = []
|
||||
token = simple_authenticate()
|
||||
|
||||
processed_songs_id = set()
|
||||
|
||||
counter = 0
|
||||
|
||||
for entry in all_songs_played:
|
||||
track_id = entry['id']
|
||||
|
||||
if track_id not in processed_songs_id:
|
||||
track_ids.append(track_id)
|
||||
processed_songs_id.add(track_id)
|
||||
counter += 1
|
||||
|
||||
if (counter + 1) % 50 == 0 and len(track_ids) > 0:
|
||||
track_ids_tuple = tuple(track_ids)
|
||||
track_ids.clear()
|
||||
response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple)
|
||||
all_songs_played_info.extend(_sort_and_create_required_dataset(response))
|
||||
counter = 0
|
||||
|
||||
if len(track_ids) > 0:
|
||||
track_ids_tuple = tuple(track_ids)
|
||||
response = get_multiple_field_information(token, 'tracks', 50, *track_ids_tuple)
|
||||
all_songs_played_info.extend(_sort_and_create_required_dataset(response))
|
||||
|
||||
return all_songs_played_info
|
||||
|
||||
|
||||
def _sort_and_create_required_dataset(response) -> dict:
|
||||
|
||||
track_list = []
|
||||
|
||||
for entry in response['tracks']:
|
||||
track_data = {
|
||||
'track_id': entry['id'],
|
||||
'album_id': entry['album']['id'],
|
||||
'artist_id': entry['artists'][0]['id']
|
||||
}
|
||||
track_list.append(track_data)
|
||||
|
||||
return track_list
|
||||
|
||||
|
||||
def _fill_missing_ids(all_songs_played, all_songs_catalogued):
|
||||
|
||||
# Create a dictionary to map track_id to artist_id and album_id
|
||||
track_id_to_artist_album = {data['track_id']: {'album_id': data['album_id'], 'artist_id': data['artist_id']} for data in all_songs_catalogued}
|
||||
|
||||
# Now, we will update the original `tracks` list by adding artist_id and album_id
|
||||
for track in all_songs_played:
|
||||
track_info = track_id_to_artist_album.get(track['id'])
|
||||
if track_info:
|
||||
track['artist_id'] = track_info['artist_id']
|
||||
track['album_id'] = track_info['album_id']
|
||||
|
||||
return all_songs_played
|
||||
|
||||
|
||||
def _insert_data_into_db(db: Database, all_songs_played: list):
|
||||
"""
|
||||
This function takes a list of all played songs and inserts these into the database.
|
||||
|
||||
:param: all_songs_played list of all songs
|
||||
"""
|
||||
for entry in all_songs_played:
|
||||
try:
|
||||
db.add_row(Table.RECENTLY_PLAYED, (entry['timestamp'], entry['id'], entry['artist_id'], entry['album_id']))
|
||||
except Exception as e:
|
||||
log.error(f'Failed adding {entry} to database, error {e}')
|
||||
|
||||
|
||||
def export_gdpr_data(db: Database, n_limit: int = 100) -> None:
|
||||
all_songs_played = _read_gdrp_data()
|
||||
all_songs_played = all_songs_played[-n_limit:]
|
||||
all_songs_catalogued = _populate_ids(all_songs_played)
|
||||
all_songs_played = _fill_missing_ids(all_songs_played, all_songs_catalogued)
|
||||
_insert_data_into_db(db, all_songs_played)
|
||||
@@ -0,0 +1,61 @@
|
||||
import logging
|
||||
import os
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class LoggerWrapper():
|
||||
|
||||
def __init__(self, logger_name: str = "standard_logger"):
|
||||
self.logger = logging.getLogger(logger_name)
|
||||
if not self.logger.handlers:
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
self.setup_logger()
|
||||
|
||||
def set_console_handler_to_debug(self):
|
||||
for handler in self.logger.handlers:
|
||||
if isinstance(handler, logging.StreamHandler):
|
||||
handler.setLevel(logging.DEBUG)
|
||||
|
||||
def setup_logger(self):
|
||||
# Define and create folder
|
||||
logs_folder = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'logs')
|
||||
Path(logs_folder).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Define file path
|
||||
log_file = log_file = os.path.join(logs_folder, 'predictify.log')
|
||||
|
||||
# Setup File Handler
|
||||
handler = RotatingFileHandler(log_file, maxBytes=1000000, backupCount=5)
|
||||
handler.setLevel(logging.DEBUG)
|
||||
|
||||
# Setup Console Handler
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel(logging.WARNING)
|
||||
|
||||
# Setup Formatter
|
||||
formatter = logging.Formatter('%(asctime)s - [%(filename)s:%(lineno)d] - %(levelname)s - %(message)s')
|
||||
|
||||
# Add Formatters to Handlers
|
||||
handler.setFormatter(formatter)
|
||||
console_handler.setFormatter(formatter)
|
||||
|
||||
# Add Handlers to Logger
|
||||
self.logger.addHandler(handler)
|
||||
self.logger.addHandler(console_handler)
|
||||
|
||||
def info(self, message):
|
||||
self.logger.info(message)
|
||||
|
||||
def debug(self, message):
|
||||
self.logger.debug(message)
|
||||
|
||||
def warning(self, message):
|
||||
self.logger.warning(message)
|
||||
|
||||
def error(self, message):
|
||||
self.logger.error(message)
|
||||
|
||||
def critical(self, message):
|
||||
self.logger.critical(message)
|
||||
# Here we can add alerting/handling
|
||||
+70
-3
@@ -1,8 +1,75 @@
|
||||
import argparse
|
||||
import atexit
|
||||
import os
|
||||
import sys
|
||||
import traceback
|
||||
from time import sleep
|
||||
|
||||
from scraper import scraping
|
||||
from database_handler import Database
|
||||
from gdpr_export import export_gdpr_data
|
||||
from logger import LoggerWrapper
|
||||
from scraper import scrape_missing_infos, scraping
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def _handle_exit():
|
||||
"""
|
||||
Function to log exit information if the script ends unexpectedly.
|
||||
"""
|
||||
log.critical("Script terminated unexpectedly.")
|
||||
|
||||
|
||||
def _log_crash_info(exc_type, exc_value, exc_tb):
|
||||
"""Custom function to log crash info when an exception occurs."""
|
||||
log.critical("A critical error occurred!", exc_info=(exc_type, exc_value, exc_tb))
|
||||
log.critical("Exception type: %s", exc_type)
|
||||
log.critical("Exception message: %s", exc_value)
|
||||
log.critical("Stack trace:\n%s", ''.join(traceback.format_tb(exc_tb)))
|
||||
|
||||
|
||||
# Register the exit handler and excepthook
|
||||
atexit.register(_handle_exit)
|
||||
sys.excepthook = _log_crash_info
|
||||
|
||||
|
||||
# Initialize the parser
|
||||
parser = argparse.ArgumentParser(description="A python script written in Python3.13 which continuously checks what spotify songs "
|
||||
"the user is listening to and logging these in a local database. \n"
|
||||
"The Script also has a export function where it can read out the gdpr data exported by the user.")
|
||||
|
||||
# Add optional arguments
|
||||
parser.add_argument('--verbose', '-v', action='store_true', help="Enable verbose output")
|
||||
parser.add_argument('--export', type=str, choices=['TEST', 'PRODUCTION'], required=True,
|
||||
help="Export the gdpr data from spotify if not done already. Choose between TEST and PRODUCTION."
|
||||
"TEST will export only a small number of songs, PRODUCTION will export all songs.")
|
||||
|
||||
# Parse the arguments
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.verbose:
|
||||
log.set_console_handler_to_debug()
|
||||
log.info('Enabled verbose mode')
|
||||
|
||||
db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', f'spotify_scrape_{args.export}.db')
|
||||
|
||||
if args.export == 'TEST':
|
||||
export_size = 10000
|
||||
log.info(f'Scraping GDPR Data. Sample size: {export_size}')
|
||||
db = Database(db_path)
|
||||
export_gdpr_data(db, export_size)
|
||||
scrape_missing_infos(db)
|
||||
elif args.export == 'PRODUCTION':
|
||||
export_size = 1000000
|
||||
log.info('Scraping all GDPR Data.')
|
||||
db = Database(db_path)
|
||||
export_gdpr_data(db, export_size)
|
||||
scrape_missing_infos(db)
|
||||
else:
|
||||
raise ValueError('Invalid export type. Please choose between TEST and PRODUCTION.')
|
||||
|
||||
# Run forever on intervals of 30 minutes
|
||||
while True:
|
||||
scraping()
|
||||
log.info('Scraping API...')
|
||||
scraping(db)
|
||||
log.info('Done scraping API. Sleeping for 30 minutes...')
|
||||
sleep(1800)
|
||||
|
||||
+95
-114
@@ -1,150 +1,131 @@
|
||||
import requests
|
||||
|
||||
from auth import authenticate, simple_authenticate
|
||||
from database_handler import Database, Table
|
||||
from logger import LoggerWrapper
|
||||
from spotify_api import get_last_played_track, get_multiple_field_information
|
||||
|
||||
db = Database('spotify_scraped.db')
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def scraping():
|
||||
def scraping(db: Database) -> None:
|
||||
"""
|
||||
This function is the main function that will be executed when the script is run
|
||||
"""
|
||||
global db
|
||||
|
||||
scope = "user-read-recently-played"
|
||||
bearer_token = authenticate(scope)
|
||||
|
||||
# Once each 30 mins
|
||||
_read_recently_played_page_and_add_to_db(bearer_token=bearer_token)
|
||||
_scrape_missing_infos()
|
||||
|
||||
db.close()
|
||||
_read_recently_played_page_and_add_to_db(db, bearer_token)
|
||||
scrape_missing_infos(db)
|
||||
|
||||
|
||||
def _read_recently_played_page_and_add_to_db(bearer_token: str):
|
||||
def _read_recently_played_page_and_add_to_db(db: Database, bearer_token: str) -> None:
|
||||
"""
|
||||
This function gets a list of song play history and adds it into the database.
|
||||
"""
|
||||
global db
|
||||
|
||||
last_played_track = _get_last_played_track(bearer_token=bearer_token)
|
||||
last_played_track = get_last_played_track(bearer_token=bearer_token)
|
||||
|
||||
for track in last_played_track['items']:
|
||||
try:
|
||||
for track in reversed(last_played_track['items']):
|
||||
track_id = track['track']['id']
|
||||
played_at = track['played_at']
|
||||
album_id = track['track']['album']['id']
|
||||
artist_id = track['track']['artists'][0]['id']
|
||||
db.add_row(Table.RECENTLY_PLAYED, (played_at, track_id, artist_id, album_id))
|
||||
except Exception as e:
|
||||
log.error(f"Failed to add returned play history to database: {e}"
|
||||
f"\nReturned Value: {last_played_track}")
|
||||
|
||||
|
||||
def _get_last_played_track(url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50", bearer_token: str = "") -> dict:
|
||||
"""
|
||||
This function returns the last played track based on the limit size
|
||||
|
||||
:param limit: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
def scrape_missing_infos(db: Database) -> None:
|
||||
"""
|
||||
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
|
||||
|
||||
def _get_track_information(track_id: str, bearer_token: str) -> dict:
|
||||
"""
|
||||
This function returns the track information based on the track id
|
||||
|
||||
:param track_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/tracks/{track_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
|
||||
|
||||
def _get_artist_information(artist_id: str, bearer_token: str) -> dict:
|
||||
"""
|
||||
This function returns the artist information based on the artist id
|
||||
|
||||
:param artist_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/artists/{artist_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
|
||||
|
||||
def _get_album_information(album_id: str, bearer_token: str) -> dict:
|
||||
"""
|
||||
This function returns the album information based on the album id
|
||||
|
||||
:param album_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/albums/{album_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
|
||||
|
||||
def _scrape_missing_infos():
|
||||
"""
|
||||
"""
|
||||
global db
|
||||
|
||||
bearer_token_simple = simple_authenticate()
|
||||
|
||||
# Track Info
|
||||
all_track_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'track_id')
|
||||
all_track_ids_saved = db.read_all_rows(Table.TRACK_INFORMATION, 'track_id')
|
||||
all_track_ids_missing = list(set(all_track_ids_recently_played) - set(all_track_ids_saved))
|
||||
for track_id in all_track_ids_missing:
|
||||
response = _get_track_information(track_id=track_id[0], bearer_token=bearer_token_simple)
|
||||
db.add_row(Table.TRACK_INFORMATION, (response['id'], response['name'], response['duration_ms'], response['explicit'], response['popularity']))
|
||||
# Album Info
|
||||
all_album_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'album_id')
|
||||
all_album_ids_saved = db.read_all_rows(Table.ALBUM_INFORMATION, 'album_id')
|
||||
all_album_ids_missing = list(set(all_album_ids_recently_played) - set(all_album_ids_saved))
|
||||
for album_id in all_album_ids_missing:
|
||||
response = _get_album_information(album_id=album_id[0], bearer_token=bearer_token_simple)
|
||||
_process_missing_info(db, bearer_token_simple, Table.TRACK_INFORMATION, 'track_id', 'tracks')
|
||||
_process_missing_info(db, bearer_token_simple, Table.ALBUM_INFORMATION, 'album_id', 'albums')
|
||||
_process_missing_info(db, bearer_token_simple, Table.ARTIST_INFORMATION, 'artist_id', 'artists')
|
||||
# _process_missing_info(db, bearer_token_simple, Table.TRACK_ATTRIBUTES, 'track_id', 'audio-features')
|
||||
|
||||
|
||||
def _process_missing_info(db: Database, bearer_token_simple: str, table_name: Table, id_field_name: str, endpoint_name: str) -> None:
|
||||
|
||||
if endpoint_name == 'albums':
|
||||
limit = 20
|
||||
elif endpoint_name == 'audio-features':
|
||||
limit = 100
|
||||
else:
|
||||
limit = 50
|
||||
|
||||
all_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, id_field_name)
|
||||
all_ids_saved = db.read_all_rows(table_name, id_field_name)
|
||||
all_ids_missing = list(set(all_ids_recently_played) - set(all_ids_saved))
|
||||
|
||||
log.debug(f"Number of missing {table_name.name} entries: {len(all_ids_missing)}. Inserting...")
|
||||
|
||||
ids = []
|
||||
processed_ids = set()
|
||||
|
||||
counter = 0
|
||||
|
||||
for id_value in all_ids_missing:
|
||||
|
||||
id_value_str = id_value[0]
|
||||
|
||||
if id_value_str not in processed_ids:
|
||||
ids.append(id_value_str)
|
||||
processed_ids.add(id_value_str)
|
||||
counter += 1
|
||||
|
||||
if (counter + 1) % limit == 0 and len(ids) > 0:
|
||||
ids_tuple = tuple(ids)
|
||||
ids.clear()
|
||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||
_add_data_to_database(db, table_name, response)
|
||||
counter = 0
|
||||
|
||||
if len(ids) > 0:
|
||||
ids_tuple = tuple(ids)
|
||||
ids.clear()
|
||||
response = get_multiple_field_information(bearer_token_simple, endpoint_name, limit, *ids_tuple)
|
||||
_add_data_to_database(db, table_name, response)
|
||||
|
||||
|
||||
def _add_data_to_database(db: Database, table_name: Table, response) -> None:
|
||||
|
||||
if table_name == Table.TRACK_INFORMATION:
|
||||
log.debug('Adding track information to database')
|
||||
for entry in response['tracks']:
|
||||
log.debug(f"Adding track: {entry['name']}")
|
||||
db.add_row(table_name, (entry['id'], entry['name'], entry['duration_ms'], entry['explicit'], entry['popularity']))
|
||||
|
||||
elif table_name == Table.ALBUM_INFORMATION:
|
||||
log.debug('Adding album information to database')
|
||||
for entry in response['albums']:
|
||||
log.debug(f"Adding album: {entry['name']}")
|
||||
try:
|
||||
release_year = response['release_date'][:4]
|
||||
release_year = entry['release_date'][:4]
|
||||
except Exception:
|
||||
release_year = ""
|
||||
db.add_row(Table.ALBUM_INFORMATION, (response['id'], response['name'], response['album_type'], response['total_tracks'], release_year, response['label']))
|
||||
# Artist Info
|
||||
all_artist_ids_recently_played = db.read_all_rows(Table.RECENTLY_PLAYED, 'artist_id')
|
||||
all_artist_ids_saved = db.read_all_rows(Table.ARTIST_INFORMATION, 'artist_id')
|
||||
all_artist_ids_missing = list(set(all_artist_ids_recently_played) - set(all_artist_ids_saved))
|
||||
for artist_id in all_artist_ids_missing:
|
||||
response = _get_artist_information(artist_id=artist_id[0], bearer_token=bearer_token_simple)
|
||||
db.add_row(table_name, (entry['id'], entry['name'], entry['album_type'], entry['total_tracks'], release_year, entry['label']))
|
||||
|
||||
elif table_name == Table.ARTIST_INFORMATION:
|
||||
log.debug('Adding artist information to database')
|
||||
for entry in response['artists']:
|
||||
log.debug(f"Adding artist: {entry['name']}")
|
||||
try:
|
||||
genre = response['genres'][0]
|
||||
genre = entry['genres'][0]
|
||||
except IndexError:
|
||||
genre = ""
|
||||
db.add_row(Table.ARTIST_INFORMATION, (response['id'], response['name'], response['followers']['total'], genre, response['popularity']))
|
||||
db.add_row(Table.ARTIST_INFORMATION, (entry['id'], entry['name'], entry['followers']['total'], genre, entry['popularity']))
|
||||
|
||||
elif table_name == Table.TRACK_ATTRIBUTES:
|
||||
log.debug('Adding track attributes to database')
|
||||
for entry in response['audio_features']:
|
||||
log.debug(f"Adding track attributes: {entry['id']}")
|
||||
try:
|
||||
db.add_row(Table.TRACK_ATTRIBUTES, (entry['id'], entry['aucousticness'], entry['danceability'], entry['duration_ms'], entry['energy'], entry['instrumentalness'], entry['key'], entry['liveness'], entry['loudness'], entry['speechiness'], entry['tempo'], entry['time_signature'], entry['valence']))
|
||||
except Exception as e:
|
||||
log.error(f"Failed to add track attributes to database: {e}"
|
||||
f"\nReturned Value: {response}")
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
from typing import Union
|
||||
|
||||
import requests
|
||||
|
||||
from logger import LoggerWrapper
|
||||
|
||||
log = LoggerWrapper()
|
||||
|
||||
|
||||
def get_last_played_track(bearer_token: str, url: str = "https://api.spotify.com/v1/me/player/recently-played?limit=50") -> Union[dict, None]:
|
||||
"""
|
||||
This function returns the last played track based on the limit size
|
||||
|
||||
:param limit: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
try:
|
||||
log.debug(f"GET Request: {url}")
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error in get_last_played_track: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_track_information(track_id: str, bearer_token: str) -> Union[dict, None]:
|
||||
"""
|
||||
This function returns the track information based on the track id
|
||||
|
||||
:param track_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/tracks/{track_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
try:
|
||||
log.debug(f"GET Request: {url}")
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error in get_track_information: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_artist_information(artist_id: str, bearer_token: str) -> Union[dict, None]:
|
||||
"""
|
||||
This function returns the artist information based on the artist id
|
||||
|
||||
:param artist_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/artists/{artist_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
try:
|
||||
log.debug(f"GET Request: {url}")
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error in get_artist_information: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_album_information(album_id: str, bearer_token: str) -> Union[dict, None]:
|
||||
"""
|
||||
This function returns the album information based on the album id
|
||||
|
||||
:param album_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
url = f"https://api.spotify.com/v1/albums/{album_id}"
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
try:
|
||||
log.debug(f"GET Request: {url}")
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error in get_album_information: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_multiple_field_information(bearer_token: str, api_type: str, limit: int, *track_ids) -> Union[dict, None]:
|
||||
"""
|
||||
This function returns the track information based on the track id
|
||||
|
||||
:param *track_id: str
|
||||
:param bearer_token: str
|
||||
:return: dict
|
||||
"""
|
||||
|
||||
if len(track_ids) > limit:
|
||||
log.error(f'exceeding the limit if ids {limit} for endpoint {api_type}')
|
||||
return None
|
||||
|
||||
url_suffix = "ids="
|
||||
separator = ","
|
||||
try:
|
||||
for track_id in track_ids:
|
||||
url_suffix = url_suffix + track_id + separator
|
||||
except Exception as e:
|
||||
log.error(f"Failed setting up the url for multiple ids request."
|
||||
f"Error: {e}")
|
||||
return None
|
||||
|
||||
url = f"https://api.spotify.com/v1/{api_type}?{url_suffix}"
|
||||
url = url[:-len(separator)]
|
||||
header = {
|
||||
'Authorization': f'Bearer {bearer_token}'
|
||||
}
|
||||
|
||||
try:
|
||||
log.debug(f"GET Request: {url}")
|
||||
response = requests.get(url, headers=header)
|
||||
response_json = response.json()
|
||||
return response_json
|
||||
except requests.exceptions.RequestException as e:
|
||||
log.error(f"Error in get_multiple_field_information: {e}")
|
||||
return None
|
||||
-14
@@ -1,14 +0,0 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Starup the predictify scraper
|
||||
|
||||
if test -f ./requirements.txt
|
||||
then
|
||||
python3 -m venv .venv
|
||||
.venv/bin/pip install -r ./requirements.txt
|
||||
else
|
||||
printf "Missing requirements file! aborting...\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
.venv/bin/python3 src/runtime.py
|
||||
Reference in New Issue
Block a user