In [34]:
import json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
import math
import pickle
In [35]:
def save_obj(obj, name):
    with open('saved_content/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
In [36]:
#global constants
num_train_slices = 1
num_test_slices = 1
num_predictions = 500
num_hidden_tracks = 4
In [37]:
df = pd.DataFrame()

prediction_df = pd.DataFrame()
hidden_df = pd.DataFrame()


for x in range(0, num_train_slices):
    #file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/train.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        df = pd.concat([df, json_normalize(json_data['playlists'])])
        
print "Number of playlists loaded:", len(df)
Number of playlists loaded: 2
In [38]:
print "Main data:"
df.head()
Main data:
Out[38]:
collaborative description duration_ms modified_at name num_albums num_artists num_edits num_followers num_tracks pid tracks
0 false New songs are in Playlist 11532414 1493424000 Throwbacks 47 37 6 1 52 0 [{u'pos': 0, u'artist_name': u'Missy Elliott',...
1 false Old songs are in Playlist 11656470 1506556800 Awesome Playlist 23 21 5 1 39 1 [{u'pos': 0, u'artist_name': u'Survivor', u'tr...
In [39]:
print "Number of rows containing description:", len(df[df['description'].notnull()])
Number of rows containing description: 2

Because so few rows contain description, we decided that song similarity using text matching and emotion will not be a major help towards playlist prediction.

In [40]:
df['name_words'] = df.name.str.strip().str.split('[\W_]+')
df['desc_words'] = df.description.str.strip().str.split('[\W_]+')

df.loc[df['name_words'].isnull(),'name_words'] = df.loc[df['name_words'].isnull(),'name_words'].apply(lambda x: [])
df.loc[df['desc_words'].isnull(),'desc_words'] = df.loc[df['desc_words'].isnull(),'desc_words'].apply(lambda x: [])

BAG OF WORDS MODEL

In [41]:
stop = set(stopwords.words('english'))
stemmer = PorterStemmer()

desc_word_columns = []
name_word_columns = []
for index, row in df.iterrows():
    desc_words_unstem = [word for word in row['desc_words'] if word not in stop]
    desc_words=[]
    for word in desc_words_unstem:
        if not (word==' ' or word==''):
            desc_words.append(stemmer.stem(word.lower()))
    desc_word_columns.append(desc_words)
    
    name_words_unstem = [word for word in row['name_words'] if word not in stop]
    name_words=[]
    for word in name_words_unstem:
        if not (word==' 'or word==''):
            name_words.append(stemmer.stem(word.lower()))
    name_word_columns.append(name_words)

df['desc_words'] = desc_word_columns
df['name_words'] = name_word_columns
In [42]:
df[df['description'].notnull()].head()
Out[42]:
collaborative description duration_ms modified_at name num_albums num_artists num_edits num_followers num_tracks pid tracks name_words desc_words
0 false New songs are in Playlist 11532414 1493424000 Throwbacks 47 37 6 1 52 0 [{u'pos': 0, u'artist_name': u'Missy Elliott',... [throwback] [new, song, playlist]
1 false Old songs are in Playlist 11656470 1506556800 Awesome Playlist 23 21 5 1 39 1 [{u'pos': 0, u'artist_name': u'Survivor', u'tr... [awesom, playlist] [old, song, playlist]
In [43]:
ticks = time.time()
columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri', 'album_name','name_words','desc_words']

tracks_df_list = []

for row in df.itertuples():
    tracks = row.tracks
    for song in tracks:
        tracks_df_list.append([song['track_uri'], song['track_name'],song['artist_uri'],
                            song['artist_name'], song['album_uri'], song['album_name'],
                            row.name_words,row.desc_words])

tracks_df_tmp = pd.DataFrame(tracks_df_list, columns=columns)

time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
Total time taken:0.00399994850159
In [44]:
ticks = time.time()

group_track_words = tracks_df_tmp.groupby('track_uri')
track_groups = group_track_words.groups

num_tracks = len(track_groups)

columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name', 'album_uri', 'album_name','words','count']

tracks_count = np.zeros(num_tracks) #To store the count of each track in the dataset
tracks_df_list = []
i=0
track_uri_count_dict = {}

for track in track_groups:
    tmp_list = []
    
    for index in track_groups[track]:
        row = tracks_df_tmp.iloc[index]
        tmp_list.extend([word for word in row['name_words']+row['desc_words']])
        
    tracks_df_list.append([row['track_uri'], row['track_name'],row['artist_uri'],
                        row['artist_name'], row['album_uri'], row['album_name'],
                        tmp_list,len(track_groups[track])])
    track_uri_count_dict[row['track_uri']] = len(track_groups[track])
    
tracks_df = pd.DataFrame(tracks_df_list,columns=columns)
print "Unique tracks:", len(tracks_df)
tracks_df.to_csv('saved_content/tracks_df.csv', encoding='utf-8', index=False)
save_obj(track_uri_count_dict,'track_uri_count_dict')

time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
tracks_df.head(5)
Unique tracks: 89
Total time taken:0.0920000076294
Out[44]:
track_uri track_name artist_uri artist_name album_uri album_name words count
0 spotify:track:1HZ3cUZUw5htSFmah1V8Ko Shine spotify:artist:4e5V1Q2dKCzbLVMQ8qbTn6 Collective Soul spotify:album:3uSVy9q97sAzd5UE0dMdOT Hints, Allegations & Things Left Unsaid [awesom, playlist, old, song, playlist] 1
1 spotify:track:3H1LCvO3fVsK2HPguhbml0 Forever spotify:artist:7bXgB6jMjp9ATFy66eO08Z Chris Brown spotify:album:1UtE4zAlSE2TlKmTFgrTg5 Exclusive - The Forever Edition [throwback, new, song, playlist] 1
2 spotify:track:6UGHk2cmbDC1oidVjXcCKo Red Barchetta spotify:artist:2Hkut4rAAyrQxRdof7FVJq Rush spotify:album:2xg7iIKoSqaDNpDbJnyCjY Moving Pictures [awesom, playlist, old, song, playlist] 1
3 spotify:track:1X5WGCrUMuwRFuYU1eAo2I Livin on Sunday spotify:artist:7vWBZm3sQ8yQvfV4nXxHXK Cris Cab spotify:album:2kNznk4KDkYXifzOAUDoXN Red Road [throwback, new, song, playlist] 1
4 spotify:track:1b7vg5T9YKR3NNqXfBYRF7 Check Yes Juliet spotify:artist:3ao3jf5d70Tf4fPh2bnXVl We The Kings spotify:album:2F1hfUOuMnOxtSfrktL8VX We The Kings [throwback, new, song, playlist] 1
In [45]:
bag_of_words = defaultdict(int)
count = 0

for index, row in tracks_df.iterrows():
    for item in row['words']:
        if item.lower() not in bag_of_words:
            bag_of_words[item.lower()] = count
            count += 1
In [46]:
A = np.zeros(shape=(len(tracks_df),len(bag_of_words)))
In [47]:
for index, row in tracks_df.iterrows():
    for item in row['words']:
        A[index][bag_of_words[item.lower()]] += 1
In [48]:
S = np.matmul(A, A.transpose())
norms = np.sum(A*A, axis=1)

S = S/np.sqrt(np.matmul(norms[:,None], norms[None,:]))
In [49]:
ART_SIM = np.zeros(shape=(len(tracks_df),len(tracks_df)))
ALB_SIM = np.zeros(shape=(len(tracks_df),len(tracks_df)))
In [50]:
import time
ticks = time.time()

groups_artists = tracks_df.groupby('artist_uri')['track_uri']
groups_album = tracks_df.groupby('album_uri')['track_uri']

from itertools import combinations

artist_track_map = groups_artists.groups
album_track_map = groups_album.groups
for artist in artist_track_map:
    for comb in combinations(artist_track_map[artist],2):
        ART_SIM[comb[0],comb[1]]=1
        ART_SIM[comb[1],comb[0]]=1

for album in album_track_map:
    for comb in combinations(album_track_map[album],2):
        ALB_SIM[comb[0],comb[1]]=1
        ALB_SIM[comb[1],comb[0]]=1
        
np.fill_diagonal(ART_SIM, 1)
np.fill_diagonal(ALB_SIM, 1)

time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
Total time taken:0.00399994850159
In [51]:
test_df = pd.DataFrame()
for x in range(num_train_slices, num_train_slices+num_test_slices):
    #file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/test.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
        test_df = pd.concat([test_df, json_normalize(json_data['playlists'])])
In [52]:
test_df.head()
Out[52]:
collaborative description duration_ms modified_at name num_albums num_artists num_edits num_followers num_tracks pid tracks
0 false Really New songs are in Playlist 11532414 1493424000 Throwbacks second part 47 37 6 1 52 0 [{u'pos': 0, u'artist_name': u'Missy Elliott',...
1 false Old songs are in Playlist 11656470 1506556800 Really Awesome Playlist 23 21 5 1 39 1 [{u'pos': 0, u'artist_name': u'Survivor', u'tr...
In [53]:
def calculate_scores(uri):
    track_index = tracks_df.loc[tracks_df['track_uri']==uri].index[0]
    return np.add(S[track_index], np.add(ART_SIM[track_index], ALB_SIM[track_index]))
In [54]:
def get_predictions(row):
    
    score = np.zeros(len(tracks_df))
    
    this_playlist = set([])
    for song in row['tracks']:
        this_playlist.add(str(song['track_uri']))
        if any(tracks_df['track_uri'] == str(song['track_uri'])):   # for now, only if the song is in training data, we calculate
            score = np.add(score, calculate_scores(song['track_uri']))
    
    indices = np.argsort(-score)
    score[::-1].sort()
    
    count = 0
    sims = []
    predicted_tracks = []
    for x in range(len(tracks_df)):
        if count == num_predictions:
            break
        if str(tracks_df.iloc[indices[x]]['track_uri']) not in this_playlist:
            sims.append(tracks_df.iloc[indices[x]]['track_name'])
            predicted_tracks.append(tracks_df.iloc[indices[x]])
            count += 1
    return predicted_tracks

def get_predictions_test(row):
    sims = [track['track_name'] for track in get_predictions(row)]
    print "20 Suggested songs for playlist-", row['name'], ":"
    print sims, "\n"

    
for index,row in test_df.iterrows():
    get_predictions_test(row)   
20 Suggested songs for playlist- Throwbacks second part :
[u"Jumpin', Jumpin'", u'Beep', u'Check On It - feat. Bun B and Slim Thug', u'How Do You Sleep? - Featuring Ludacris', u'A Thousand Miles', u'Everytime We Touch - Radio Edit', u'Sk8er Boi', u'Mr. Brightside', u'Run It!', u'All The Small Things', u'Baby', u'Your Love Is My Drug', u'Party In The U.S.A.', u'This Is Me', u'My Happy Ending', u'Me & U', u"Leavin'", u'Replay', u'Somebody To Love', u"Stacy's Mom", u'Soak Up The Sun', u'Dirty Little Secret', u'Forever', u'Where Is The Love?', u'Livin on Sunday', u'Check Yes Juliet', u'Paper Planes', u'Lip Gloss', u'Yo (Excuse Me Miss)', u'One Less Lonely Girl', u'Closer', u'Ice Box', u'Just The Girl', u'Right Where You Want Me - Radio Edit Version', u'Whatcha Say', u'Beautiful Soul', u'Year 3000', u'See You Again', u'Miss Independent', u'The Room Where It Happens', u'Right Hand Man', u'Limelight', u'Kashmir', u'I am the Doctor in Utah', u'Sympathy For The Devil', u'Tom Sawyer', u'Peace of Mind', u'Shine', u'Carry on Wayward Son', u'Eye of the Tiger', u'Red Barchetta', u'Concerning Hobbits (The Lord of the Rings)', u'U.N. Owen Was Her? (Remix)', u'Witch Hunt', u"He's a Pirate (Pirates of the Caribbean)", u'Bohemian Rhapsody - Remastered 2011', u'Fortunate Son', u'Stairway To Heaven', u'Libera Me From Hell (Tengen Toppa Gurren Lagann)', u'The Camera Eye', u'YYZ', u'Hold the Line', u'More Than a Feeling', u"Sweet Child O' Mine", u'Free Bird', u'Gimme Shelter', u'Crazy', u"Very Bloody Tears (Castlevania II: Simon's Quest)", u'Sweet Home Alabama', u'The Blood of Cuchulainn (The Boondock Saints)', u'Alexander Hamilton', u'Ghost Love Score', u'Pok\xe9mon Theme', u'My Shot', u'The Majestic Tale (Of A Madman In A Box)', u'Rock You Like A Hurricane', u'Foreplay / Long Time'] 

20 Suggested songs for playlist- Really Awesome Playlist :
[u'Pok\xe9mon Theme', u'Alexander Hamilton', u'My Shot', u'The Majestic Tale (Of A Madman In A Box)', u'Crazy', u'Free Bird', u'Sweet Home Alabama', u"Sweet Child O' Mine", u'Gimme Shelter', u'Shine', u'More Than a Feeling', u'Rock You Like A Hurricane', u'Peace of Mind', u'Sympathy For The Devil', u'Tom Sawyer', u'Limelight', u'Kashmir', u'Ghost Love Score', u'Hold the Line', u'Carry on Wayward Son', u'YYZ', u'Red Barchetta', u'Witch Hunt', u'Bohemian Rhapsody - Remastered 2011', u'Fortunate Son', u'Stairway To Heaven', u'The Camera Eye', u'Foreplay / Long Time', u'Lose Control (feat. Ciara & Fat Man Scoop)', u'Toxic', u'Forever', u"Leavin'", u"It Wasn't Me", u'Your Love Is My Drug', u'Baby', u'Paper Planes', u'My Boo', u'A Thousand Miles', u'Promiscuous', u'Beep', u'Where Is The Love?', u'Soak Up The Sun', u'See You Again', u'Just The Girl', u"Jumpin', Jumpin'", u'One Less Lonely Girl', u'Yo (Excuse Me Miss)', u'Check On It - feat. Bun B and Slim Thug', u'Check Yes Juliet', u'Livin on Sunday', u'Yeah!', u'Replay', u'Me & U', u'The Great Escape', u'Year 3000', u'Right Where You Want Me - Radio Edit Version', u'Run It!', u'Party In The U.S.A.', u'This Is Me', u'Miss Independent', u'Crazy In Love', u"Stacy's Mom", u'Somebody To Love', u'How Do You Sleep? - Featuring Ludacris', u'Closer', u'Whatcha Say', u'Beautiful Soul', u'Say My Name', u'My Happy Ending', u'All The Small Things', u'Rock Your Body', u'Everytime We Touch - Radio Edit', u'Sk8er Boi', u'Ice Box', u'Dirty Little Secret', u'Hey Ya! - Radio Mix / Club Mix', u'Lip Gloss', u'Buttons', u'Mr. Brightside'] 

ARTIST- ARTIST SIMILARITY

In [55]:
import time
ticks = time.time()
columns = ['pid', 'name', 'artist_uri', 'artist_name']

artist_df_list = []

for row in df.itertuples():
    tracks = row.tracks
    for song in tracks:
        artist_df_list.append([row.pid,row.name,song['artist_uri'],song['artist_name']])

artist_df_tmp = pd.DataFrame(artist_df_list, columns=columns)

time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
print(len(artist_df_tmp))
artist_df_tmp.head()
Total time taken:0.00399994850159
92
Out[55]:
pid name artist_uri artist_name
0 0 Throwbacks spotify:artist:2wIVse2owClT7go1WT98tk Missy Elliott
1 0 Throwbacks spotify:artist:26dSoYclwsYLMAKD3tpOr4 Britney Spears
2 0 Throwbacks spotify:artist:6vWDO969PvNqNYHIOW5v0m Beyoncé
3 0 Throwbacks spotify:artist:31TPClRtHm23RisEBtV3X7 Justin Timberlake
4 0 Throwbacks spotify:artist:5EvFsr3kj42KNv97ZEnqij Shaggy
In [56]:
ticks = time.time()
artist_groups = artist_df_tmp.groupby('artist_uri').groups

num_artists = len(artist_groups)

artist_playlist_matrix = np.zeros((num_artists,num_train_slices*1000))#Matrix to store the artist vectors

artists_uri_train = []

i=0
for artist in artist_groups:
    artists_uri_train.append(artist)
    for index in artist_groups[artist]:
        pid = artist_df_tmp.iloc[index]['pid']
        artist_playlist_matrix[i][pid]=1
    i+=1

artists_uri_train = np.array(artists_uri_train)

time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
Total time taken:0.0320000648499
In [57]:
print "Number of artists:", num_artists
Number of artists: 58
In [58]:
ticks1 = time.time()

from scipy import sparse
sparse_art_plst_mtrx = sparse.csr_matrix(artist_playlist_matrix)

from sklearn.metrics.pairwise import cosine_similarity
Artist_sim_weights = cosine_similarity(sparse_art_plst_mtrx)
np.save('saved_content/Artist_sim_weights', Artist_sim_weights)

ticks2 = time.time()

print "Time taken:", ticks2-ticks1
Time taken: 0.00800013542175
In [59]:
Art_to_ind_dict = {art:i for (i,),art in np.ndenumerate(artists_uri_train)}
save_obj(Art_to_ind_dict,'Art_to_ind_dict')