import json
import math
import os
import pickle
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pandas.io.json import json_normalize
def save_obj(obj, name):
    """Pickle *obj* to ``saved_content/<name>.pkl``.

    Creates the ``saved_content`` directory if it does not exist yet
    (the original raised FileNotFoundError on a fresh checkout).
    """
    os.makedirs('saved_content', exist_ok=True)
    with open('saved_content/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# global constants
num_train_slices = 1    # number of training JSON slices to load
num_test_slices = 1     # number of held-out JSON slices to load later
num_predictions = 500   # tracks suggested per playlist
num_hidden_tracks = 4   # NOTE(review): unused in this chunk -- confirm before removing

df = pd.DataFrame()             # training playlists
prediction_df = pd.DataFrame()  # NOTE(review): never populated in this chunk
hidden_df = pd.DataFrame()      # NOTE(review): never populated in this chunk

# Load every training slice into one flat DataFrame of playlists.
for x in range(num_train_slices):
    # file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/train.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    df = pd.concat([df, json_normalize(json_data['playlists'])])

# Fixed Python-2 print statements; also actually print the head() preview,
# which was a dead notebook-style expression in a script.
print("Number of playlists loaded:", len(df))
print("Main data:")
print(df.head())
print("Number of rows containing description:", len(df[df['description'].notnull()]))
# Tokenise playlist names and descriptions on runs of non-word characters.
df['name_words'] = df.name.str.strip().str.split(r'[\W_]+')
df['desc_words'] = df.description.str.strip().str.split(r'[\W_]+')
# Missing name/description -> empty token list instead of NaN.
df.loc[df['name_words'].isnull(), 'name_words'] = df.loc[df['name_words'].isnull(), 'name_words'].apply(lambda x: [])
df.loc[df['desc_words'].isnull(), 'desc_words'] = df.loc[df['desc_words'].isnull(), 'desc_words'].apply(lambda x: [])

stop = set(stopwords.words('english'))
stemmer = PorterStemmer()

def _clean_tokens(words):
    """Lower-case *words*, drop empties and stopwords, Porter-stem the rest."""
    cleaned = []
    for word in words:
        lower = word.lower()
        # BUG FIX: compare the lower-cased token against the (all-lowercase)
        # NLTK stopword list, so "The" is filtered just like "the".
        if lower and lower != ' ' and lower not in stop:
            cleaned.append(stemmer.stem(lower))
    return cleaned

# The original duplicated the same cleanup loop for both columns.
desc_word_columns = []
name_word_columns = []
for index, row in df.iterrows():
    desc_word_columns.append(_clean_tokens(row['desc_words']))
    name_word_columns.append(_clean_tokens(row['name_words']))
df['desc_words'] = desc_word_columns
df['name_words'] = name_word_columns
df[df['description'].notnull()].head()  # notebook-style peek; no effect in a script
ticks = time.time()
# Explode each playlist into one row per track, carrying the playlist's
# cleaned name/description tokens along with the track's metadata.
columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name',
           'album_uri', 'album_name', 'name_words', 'desc_words']
tracks_df_list = []
for row in df.itertuples():
    for song in row.tracks:
        tracks_df_list.append([
            song['track_uri'], song['track_name'], song['artist_uri'],
            song['artist_name'], song['album_uri'], song['album_name'],
            row.name_words, row.desc_words,
        ])
tracks_df_tmp = pd.DataFrame(tracks_df_list, columns=columns)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
ticks = time.time()
# Collapse the per-playlist rows to one row per unique track, pooling every
# name/description token the track co-occurred with, plus its occurrence
# count across all training playlists.
group_track_words = tracks_df_tmp.groupby('track_uri')
track_groups = group_track_words.groups
num_tracks = len(track_groups)
columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name',
           'album_uri', 'album_name', 'words', 'count']
# Removed the unused `tracks_count` array and the dead `i = 0` counter.
tracks_df_list = []
track_uri_count_dict = {}
for track in track_groups:
    tmp_list = []
    for index in track_groups[track]:
        row = tracks_df_tmp.iloc[index]
        tmp_list.extend(row['name_words'] + row['desc_words'])
    # `row` is the group's last row; all rows of a group share track metadata.
    tracks_df_list.append([row['track_uri'], row['track_name'], row['artist_uri'],
                           row['artist_name'], row['album_uri'], row['album_name'],
                           tmp_list, len(track_groups[track])])
    track_uri_count_dict[row['track_uri']] = len(track_groups[track])
tracks_df = pd.DataFrame(tracks_df_list, columns=columns)
print("Unique tracks:", len(tracks_df))
tracks_df.to_csv('saved_content/tracks_df.csv', encoding='utf-8', index=False)
save_obj(track_uri_count_dict, 'track_uri_count_dict')
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
tracks_df.head(5)
# Build a bag-of-words vocabulary over every track's pooled tokens.
# Plain dict instead of defaultdict(int): the default factory would silently
# map any stray unseen token to column 0 instead of failing loudly.
bag_of_words = {}  # token -> column index
count = 0
for index, row in tracks_df.iterrows():
    for item in row['words']:
        token = item.lower()
        if token not in bag_of_words:
            bag_of_words[token] = count
            count += 1

# Term-count matrix: one row per track, one column per vocabulary token.
A = np.zeros(shape=(len(tracks_df), len(bag_of_words)))
for index, row in tracks_df.iterrows():
    for item in row['words']:
        A[index][bag_of_words[item.lower()]] += 1

# Cosine similarity between track word vectors.
S = np.matmul(A, A.transpose())
norms = np.sum(A * A, axis=1)
# BUG FIX: tracks with no tokens have a zero norm; the original produced
# 0/0 -> NaN rows. Define similarity involving an empty vector as 0.
denom = np.sqrt(np.matmul(norms[:, None], norms[None, :]))
with np.errstate(divide='ignore', invalid='ignore'):
    S = np.where(denom > 0, S / denom, 0.0)
ART_SIM = np.zeros(shape=(len(tracks_df), len(tracks_df)))
ALB_SIM = np.zeros(shape=(len(tracks_df), len(tracks_df)))
import time
ticks = time.time()
groups_artists = tracks_df.groupby('artist_uri')['track_uri']
groups_album = tracks_df.groupby('album_uri')['track_uri']
from itertools import combinations
artist_track_map = groups_artists.groups
album_track_map = groups_album.groups
# Mark every pair of track indices sharing an artist (resp. album) as
# similar; both matrices are symmetric with a unit diagonal.
for artist, members in artist_track_map.items():
    for i, j in combinations(members, 2):
        ART_SIM[i, j] = ART_SIM[j, i] = 1
for album, members in album_track_map.items():
    for i, j in combinations(members, 2):
        ALB_SIM[i, j] = ALB_SIM[j, i] = 1
np.fill_diagonal(ART_SIM, 1)
np.fill_diagonal(ALB_SIM, 1)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
test_df = pd.DataFrame()
# Load the held-out slices used for evaluation, mirroring the training load.
for x in range(num_train_slices, num_train_slices + num_test_slices):
    # file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/test.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    test_df = pd.concat([test_df, json_normalize(json_data['playlists'])])
test_df.head()
def calculate_scores(uri):
    """Combined similarity of every training track to the track *uri*.

    Sums the word-cosine (S), same-artist (ART_SIM) and same-album
    (ALB_SIM) rows at the track's position in ``tracks_df``.
    """
    idx = tracks_df.index[tracks_df['track_uri'] == uri][0]
    return S[idx] + ART_SIM[idx] + ALB_SIM[idx]
def get_predictions(row):
    """Return up to ``num_predictions`` rows of ``tracks_df``, ranked by
    combined similarity to the playlist *row*, excluding tracks already in
    the playlist.

    Removed two pieces of dead code from the original: a ``sims`` list that
    was built and discarded, and an in-place reverse sort of ``score``
    (``score`` is never read after ``argsort``).
    """
    score = np.zeros(len(tracks_df))
    this_playlist = set()
    for song in row['tracks']:
        uri = str(song['track_uri'])
        this_playlist.add(uri)
        # For now, only songs present in the training data contribute.
        if any(tracks_df['track_uri'] == uri):
            score = np.add(score, calculate_scores(song['track_uri']))
    indices = np.argsort(-score)  # best-first ordering of track indices
    count = 0
    predicted_tracks = []
    for x in range(len(tracks_df)):
        if count == num_predictions:
            break
        candidate = tracks_df.iloc[indices[x]]  # hoisted: looked up 3x before
        if str(candidate['track_uri']) not in this_playlist:
            predicted_tracks.append(candidate)
            count += 1
    return predicted_tracks
def get_predictions_test(row):
    """Print the suggested track names for one test playlist *row*."""
    sims = [track['track_name'] for track in get_predictions(row)]
    # BUG FIX (message): the header hard-coded "20" while num_predictions
    # is configurable (currently 500); report the configured count.
    print(num_predictions, "Suggested songs for playlist-", row['name'], ":")
    print(sims, "\n")

for index, row in test_df.iterrows():
    get_predictions_test(row)
# (removed redundant mid-file `import time`; it is imported at the top)
ticks = time.time()
# One row per (playlist, track) occurrence, keyed by the track's artist;
# used below to build artist-over-playlist incidence vectors.
columns = ['pid', 'name', 'artist_uri', 'artist_name']
artist_df_list = []
for row in df.itertuples():
    for song in row.tracks:
        artist_df_list.append([row.pid, row.name, song['artist_uri'], song['artist_name']])
artist_df_tmp = pd.DataFrame(artist_df_list, columns=columns)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
print(len(artist_df_tmp))
artist_df_tmp.head()
ticks = time.time()
artist_groups = artist_df_tmp.groupby('artist_uri').groups
num_artists = len(artist_groups)
# Binary artist x playlist incidence matrix.
# NOTE(review): assumes every pid < num_train_slices*1000 -- confirm pids
# are contiguous per slice, otherwise the indexing below raises IndexError.
artist_playlist_matrix = np.zeros((num_artists, num_train_slices * 1000))
artists_uri_train = []
# enumerate replaces the original's manual `i = 0` / `i += 1` counter.
for i, artist in enumerate(artist_groups):
    artists_uri_train.append(artist)
    for index in artist_groups[artist]:
        pid = artist_df_tmp.iloc[index]['pid']
        artist_playlist_matrix[i][pid] = 1
artists_uri_train = np.array(artists_uri_train)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
print("Number of artists:", num_artists)
ticks1 = time.time()
from scipy import sparse
# Sparse representation: the incidence matrix is mostly zeros.
sparse_art_plst_mtrx = sparse.csr_matrix(artist_playlist_matrix)
from sklearn.metrics.pairwise import cosine_similarity
# Artist-artist cosine similarity over their playlist-incidence vectors.
Artist_sim_weights = cosine_similarity(sparse_art_plst_mtrx)
np.save('saved_content/Artist_sim_weights', Artist_sim_weights)
ticks2 = time.time()
print("Time taken:", ticks2 - ticks1)
# Map artist URI -> row index into Artist_sim_weights, then persist it.
Art_to_ind_dict = {art: i for (i,), art in np.ndenumerate(artists_uri_train)}
save_obj(Art_to_ind_dict, 'Art_to_ind_dict')