import json
import math
import os
import pickle
import time
from collections import defaultdict

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from pandas.io.json import json_normalize
def save_obj(obj, name):
    """Pickle *obj* to ``saved_content/<name>.pkl``.

    Creates the ``saved_content`` directory if it does not exist yet
    (the original raised FileNotFoundError on a fresh checkout).
    """
    os.makedirs('saved_content', exist_ok=True)
    with open('saved_content/' + name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# global constants
num_train_slices = 1    # number of training JSON slices to load
num_test_slices = 1     # number of held-out JSON slices to load later
num_predictions = 500   # tracks suggested per playlist
num_hidden_tracks = 4   # NOTE(review): unused in this chunk -- confirm before removing

df = pd.DataFrame()             # training playlists
prediction_df = pd.DataFrame()  # NOTE(review): never populated in this chunk
hidden_df = pd.DataFrame()      # NOTE(review): never populated in this chunk

# Load every training slice into one flat DataFrame of playlists.
for x in range(num_train_slices):
    # file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/train.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    df = pd.concat([df, json_normalize(json_data['playlists'])])

# Fixed Python-2 print statements; also actually print the head() preview,
# which was a dead notebook-style expression in a script.
print("Number of playlists loaded:", len(df))
print("Main data:")
print(df.head())
print("Number of rows containing description:", len(df[df['description'].notnull()]))
# Tokenise playlist names and descriptions on runs of non-word characters.
df['name_words'] = df.name.str.strip().str.split(r'[\W_]+')
df['desc_words'] = df.description.str.strip().str.split(r'[\W_]+')
# Missing name/description -> empty token list instead of NaN.
df.loc[df['name_words'].isnull(), 'name_words'] = df.loc[df['name_words'].isnull(), 'name_words'].apply(lambda x: [])
df.loc[df['desc_words'].isnull(), 'desc_words'] = df.loc[df['desc_words'].isnull(), 'desc_words'].apply(lambda x: [])

stop = set(stopwords.words('english'))
stemmer = PorterStemmer()

def _clean_tokens(words):
    """Lower-case *words*, drop empties and stopwords, Porter-stem the rest."""
    cleaned = []
    for word in words:
        lower = word.lower()
        # BUG FIX: compare the lower-cased token against the (all-lowercase)
        # NLTK stopword list, so "The" is filtered just like "the".
        if lower and lower != ' ' and lower not in stop:
            cleaned.append(stemmer.stem(lower))
    return cleaned

# The original duplicated the same cleanup loop for both columns.
desc_word_columns = []
name_word_columns = []
for index, row in df.iterrows():
    desc_word_columns.append(_clean_tokens(row['desc_words']))
    name_word_columns.append(_clean_tokens(row['name_words']))
df['desc_words'] = desc_word_columns
df['name_words'] = name_word_columns
df[df['description'].notnull()].head()  # notebook-style peek; no effect in a script
ticks = time.time()
# Explode each playlist into one row per track, carrying the playlist's
# cleaned name/description tokens along with the track's metadata.
columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name',
           'album_uri', 'album_name', 'name_words', 'desc_words']
tracks_df_list = []
for row in df.itertuples():
    for song in row.tracks:
        tracks_df_list.append([
            song['track_uri'], song['track_name'], song['artist_uri'],
            song['artist_name'], song['album_uri'], song['album_name'],
            row.name_words, row.desc_words,
        ])
tracks_df_tmp = pd.DataFrame(tracks_df_list, columns=columns)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
ticks = time.time()
# Collapse the per-playlist rows to one row per unique track, pooling every
# name/description token the track co-occurred with, plus its occurrence
# count across all training playlists.
group_track_words = tracks_df_tmp.groupby('track_uri')
track_groups = group_track_words.groups
num_tracks = len(track_groups)
columns = ['track_uri', 'track_name', 'artist_uri', 'artist_name',
           'album_uri', 'album_name', 'words', 'count']
# Removed the unused `tracks_count` array and the dead `i = 0` counter.
tracks_df_list = []
track_uri_count_dict = {}
for track in track_groups:
    tmp_list = []
    for index in track_groups[track]:
        row = tracks_df_tmp.iloc[index]
        tmp_list.extend(row['name_words'] + row['desc_words'])
    # `row` is the group's last row; all rows of a group share track metadata.
    tracks_df_list.append([row['track_uri'], row['track_name'], row['artist_uri'],
                           row['artist_name'], row['album_uri'], row['album_name'],
                           tmp_list, len(track_groups[track])])
    track_uri_count_dict[row['track_uri']] = len(track_groups[track])
tracks_df = pd.DataFrame(tracks_df_list, columns=columns)
print("Unique tracks:", len(tracks_df))
tracks_df.to_csv('saved_content/tracks_df.csv', encoding='utf-8', index=False)
save_obj(track_uri_count_dict, 'track_uri_count_dict')
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
tracks_df.head(5)
# Build a bag-of-words vocabulary over every track's pooled tokens.
# Plain dict instead of defaultdict(int): the default factory would silently
# map any stray unseen token to column 0 instead of failing loudly.
bag_of_words = {}  # token -> column index
count = 0
for index, row in tracks_df.iterrows():
    for item in row['words']:
        token = item.lower()
        if token not in bag_of_words:
            bag_of_words[token] = count
            count += 1

# Term-count matrix: one row per track, one column per vocabulary token.
A = np.zeros(shape=(len(tracks_df), len(bag_of_words)))
for index, row in tracks_df.iterrows():
    for item in row['words']:
        A[index][bag_of_words[item.lower()]] += 1

# Cosine similarity between track word vectors.
S = np.matmul(A, A.transpose())
norms = np.sum(A * A, axis=1)
# BUG FIX: tracks with no tokens have a zero norm; the original produced
# 0/0 -> NaN rows. Define similarity involving an empty vector as 0.
denom = np.sqrt(np.matmul(norms[:, None], norms[None, :]))
with np.errstate(divide='ignore', invalid='ignore'):
    S = np.where(denom > 0, S / denom, 0.0)
ART_SIM = np.zeros(shape=(len(tracks_df), len(tracks_df)))
ALB_SIM = np.zeros(shape=(len(tracks_df), len(tracks_df)))
import time
ticks = time.time()
groups_artists = tracks_df.groupby('artist_uri')['track_uri']
groups_album = tracks_df.groupby('album_uri')['track_uri']
from itertools import combinations
artist_track_map = groups_artists.groups
album_track_map = groups_album.groups
# Mark every pair of track indices sharing an artist (resp. album) as
# similar; both matrices are symmetric with a unit diagonal.
for artist, members in artist_track_map.items():
    for i, j in combinations(members, 2):
        ART_SIM[i, j] = ART_SIM[j, i] = 1
for album, members in album_track_map.items():
    for i, j in combinations(members, 2):
        ALB_SIM[i, j] = ALB_SIM[j, i] = 1
np.fill_diagonal(ART_SIM, 1)
np.fill_diagonal(ALB_SIM, 1)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
test_df = pd.DataFrame()
# Load the held-out slices used for evaluation, mirroring the training load.
for x in range(num_train_slices, num_train_slices + num_test_slices):
    # file_name = './data/mpd.slice.' + str(x*1000) + '-' + str(x*1000+999) + '.json'
    file_name = './data/test.json'
    with open(file_name) as json_file:
        json_data = json.load(json_file)
    test_df = pd.concat([test_df, json_normalize(json_data['playlists'])])
test_df.head()
def calculate_scores(uri):
    """Combined similarity of every training track to the track *uri*.

    Sums the word-cosine (S), same-artist (ART_SIM) and same-album
    (ALB_SIM) rows at the track's position in ``tracks_df``.
    """
    idx = tracks_df.index[tracks_df['track_uri'] == uri][0]
    return S[idx] + ART_SIM[idx] + ALB_SIM[idx]
def get_predictions(row):
    """Return up to ``num_predictions`` rows of ``tracks_df``, ranked by
    combined similarity to the playlist *row*, excluding tracks already in
    the playlist.

    Removed two pieces of dead code from the original: a ``sims`` list that
    was built and discarded, and an in-place reverse sort of ``score``
    (``score`` is never read after ``argsort``).
    """
    score = np.zeros(len(tracks_df))
    this_playlist = set()
    for song in row['tracks']:
        uri = str(song['track_uri'])
        this_playlist.add(uri)
        # For now, only songs present in the training data contribute.
        if any(tracks_df['track_uri'] == uri):
            score = np.add(score, calculate_scores(song['track_uri']))
    indices = np.argsort(-score)  # best-first ordering of track indices
    count = 0
    predicted_tracks = []
    for x in range(len(tracks_df)):
        if count == num_predictions:
            break
        candidate = tracks_df.iloc[indices[x]]  # hoisted: looked up 3x before
        if str(candidate['track_uri']) not in this_playlist:
            predicted_tracks.append(candidate)
            count += 1
    return predicted_tracks
def get_predictions_test(row):
    """Print the suggested track names for one test playlist *row*."""
    sims = [track['track_name'] for track in get_predictions(row)]
    # BUG FIX (message): the header hard-coded "20" while num_predictions
    # is configurable (currently 500); report the configured count.
    print(num_predictions, "Suggested songs for playlist-", row['name'], ":")
    print(sims, "\n")

for index, row in test_df.iterrows():
    get_predictions_test(row)
# (removed redundant mid-file `import time`; it is imported at the top)
ticks = time.time()
# One row per (playlist, track) occurrence, keyed by the track's artist;
# used below to build artist-over-playlist incidence vectors.
columns = ['pid', 'name', 'artist_uri', 'artist_name']
artist_df_list = []
for row in df.itertuples():
    for song in row.tracks:
        artist_df_list.append([row.pid, row.name, song['artist_uri'], song['artist_name']])
artist_df_tmp = pd.DataFrame(artist_df_list, columns=columns)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
print(len(artist_df_tmp))
artist_df_tmp.head()
ticks = time.time()
artist_groups = artist_df_tmp.groupby('artist_uri').groups
num_artists = len(artist_groups)
# Binary artist x playlist incidence matrix.
# NOTE(review): assumes every pid < num_train_slices*1000 -- confirm pids
# are contiguous per slice, otherwise the indexing below raises IndexError.
artist_playlist_matrix = np.zeros((num_artists, num_train_slices * 1000))
artists_uri_train = []
# enumerate replaces the original's manual `i = 0` / `i += 1` counter.
for i, artist in enumerate(artist_groups):
    artists_uri_train.append(artist)
    for index in artist_groups[artist]:
        pid = artist_df_tmp.iloc[index]['pid']
        artist_playlist_matrix[i][pid] = 1
artists_uri_train = np.array(artists_uri_train)
time_taken = time.time() - ticks
print('Total time taken:{}'.format(time_taken))
print("Number of artists:", num_artists)
ticks1 = time.time()
from scipy import sparse
# Sparse representation: the incidence matrix is mostly zeros.
sparse_art_plst_mtrx = sparse.csr_matrix(artist_playlist_matrix)
from sklearn.metrics.pairwise import cosine_similarity
# Artist-artist cosine similarity over their playlist-incidence vectors.
Artist_sim_weights = cosine_similarity(sparse_art_plst_mtrx)
np.save('saved_content/Artist_sim_weights', Artist_sim_weights)
ticks2 = time.time()
print("Time taken:", ticks2 - ticks1)
# Map artist URI -> row index into Artist_sim_weights, then persist it.
Art_to_ind_dict = {art: i for (i,), art in np.ndenumerate(artists_uri_train)}
save_obj(Art_to_ind_dict, 'Art_to_ind_dict')