Anime Recommendation
Overview¶
Anime is a Japanese colloquialism derived from an abbreviation of the word “animation”. In Japan, “anime” usually refers to all animated works regardless of origin; in other parts of the world, however, the term refers primarily to Japanese animation. What sets anime apart from other animation is chiefly its cultural context. Anime originated about a hundred years ago: the first anime, “Namakura Gatana”, was made in 1917 and was only about four minutes long. Anime has come a long way since, with countless genres ranging from comedy to romance to horror. There are even unique terms for specific categories, like “shonen”, anime aimed at young boys, or “isekai”, stories about being transported to another world. Anime is typically adapted from source material such as “manga” (Japanese comics) or visual novels, although some series are original works. Like other animated media, anime is written, storyboarded, workshopped, turned into an animatic, voiced, and animated, a process that takes months and often years and is carried out by a studio of artists led by a director. Some popular anime are Dragon Ball Z, Demon Slayer, Astro Boy, Pokémon, Death Note, Akira, and Spirited Away. We will ultimately explore content-based filtering and collaborative filtering recommendation algorithms for anime.
Database¶
The dataset chosen comes from Kaggle, a reliable online platform where data scientists find and publish datasets. It can be downloaded here: https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database The data includes recommendation data from 73,516 users of myanimelist.net, one of the world's most active online anime and manga communities and databases, covering 12,294 anime. The data comes in two tables: anime.csv and rating.csv. Looking at the anime.csv table, each entry contains 7 attributes:
- anime_id: Uniquely identifies an anime
- name: Name of anime
- genre: List of genres an anime belongs to
- type: Type of anime like movie, TV, OVA, etc.
- episodes: Number of episodes the anime has (1 if type is a movie)
- rating: Average rating of anime on myanimelist
- members: Number of community members in the anime's "group"
For the rating.csv table, each entry contains 3 attributes:
- user_id: Uniquely identifies a user
- anime_id: Uniquely identifies an anime. It is a foreign key referencing anime_id from anime.csv
- rating: User's rating of the anime. Default is -1 if the user watched it but has not rated it yet
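Note that the -1 sentinel will skew any averages taken over rating.csv. One option, sketched below on a made-up mini table (the ids and ratings are invented for illustration), is to treat -1 as missing before aggregating; the analysis in this notebook keeps the -1 values as-is:

```python
import pandas as pd
import numpy as np

# Hypothetical mini version of rating.csv; -1 marks "watched but not rated"
ratings = pd.DataFrame({
    'user_id':  [1, 1, 2, 2, 3],
    'anime_id': [20, 24, 20, 79, 20],
    'rating':   [-1, 8, 10, -1, 7],
})

# Treat -1 as missing so unrated watches do not drag averages down
ratings['rating'] = ratings['rating'].replace(-1, np.nan)

# Mean rating per anime, ignoring unrated watches
mean_per_anime = ratings.groupby('anime_id')['rating'].mean()
print(mean_per_anime[20])  # -> 8.5 (users 2 and 3 rated it 10 and 7)
```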
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
data = pd.read_csv('anime.csv')
data2 = pd.read_csv('rating.csv')
data
anime_id | name | genre | type | episodes | rating | members | |
---|---|---|---|---|---|---|---|
0 | 32281 | Kimi no Na wa. | Drama, Romance, School, Supernatural | Movie | 1 | 9.37 | 200630 |
1 | 5114 | Fullmetal Alchemist: Brotherhood | Action, Adventure, Drama, Fantasy, Magic, Mili... | TV | 64 | 9.26 | 793665 |
2 | 28977 | Gintama° | Action, Comedy, Historical, Parody, Samurai, S... | TV | 51 | 9.25 | 114262 |
3 | 9253 | Steins;Gate | Sci-Fi, Thriller | TV | 24 | 9.17 | 673572 |
4 | 9969 | Gintama' | Action, Comedy, Historical, Parody, Samurai, S... | TV | 51 | 9.16 | 151266 |
... | ... | ... | ... | ... | ... | ... | ... |
12289 | 9316 | Toushindai My Lover: Minami tai Mecha-Minami | Hentai | OVA | 1 | 4.15 | 211 |
12290 | 5543 | Under World | Hentai | OVA | 1 | 4.28 | 183 |
12291 | 5621 | Violence Gekiga David no Hoshi | Hentai | OVA | 4 | 4.88 | 219 |
12292 | 6133 | Violence Gekiga Shin David no Hoshi: Inma Dens... | Hentai | OVA | 1 | 4.98 | 175 |
12293 | 26081 | Yasuji no Pornorama: Yacchimae!! | Hentai | Movie | 1 | 5.46 | 142 |
12294 rows × 7 columns
data2
user_id | anime_id | rating | |
---|---|---|---|
0 | 1 | 20 | -1 |
1 | 1 | 24 | -1 |
2 | 1 | 79 | -1 |
3 | 1 | 226 | -1 |
4 | 1 | 241 | -1 |
... | ... | ... | ... |
7813732 | 73515 | 16512 | 7 |
7813733 | 73515 | 17187 | 9 |
7813734 | 73515 | 22145 | 10 |
7813735 | 73516 | 790 | 9 |
7813736 | 73516 | 8074 | 9 |
7813737 rows × 3 columns
Preprocessing and Analyzing Anime Table¶
Before we can use the table for recommendation, we need to clean the data. Dropping NaN values from the table sets us up to extract features and perform nearest-neighbor analysis for the content-based filtering recommendation. Although there should not be any duplicate anime_id values, it is a good precaution to keep only the last entry if a duplicate is found.
The Rating Distribution¶
Before continuing to clean and prepare the data for content-based filtering, we can do some exploratory analysis to better understand the dataset. The distribution of average anime ratings in anime.csv is roughly normal with a mean of 6.48 and a standard deviation of roughly 1. Because of this, the vast majority of anime have a rating of 5 or above. The maximum rating was a perfect 10, meaning everybody who rated that anime gave it a 10, and the lowest rating was a 1.67. The graph is included below:
data = data.dropna() # Cleaning Data
data = data.drop_duplicates(subset=['anime_id'], keep= 'last')
plt.hist(data['rating'], edgecolor = 'black')
plt.title('Histogram of Anime Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
data['rating'].describe()
count    12017.000000
mean         6.478264
std          1.023857
min          1.670000
25%          5.890000
50%          6.570000
75%          7.180000
max         10.000000
Name: rating, dtype: float64
Anime Types¶
Next we should consider the distribution of anime types. There are 6 types in total: Movie, Music, ONA, OVA, Special, and TV. The most common types are TV, followed by OVA, followed by Movie (an OVA, or original video animation, is a direct-to-video release, often a follow-up to a TV series running around 1-3 episodes). Looking at the average rating of each type, the popular types received noticeably higher ratings than the less popular ones, with the exception of Specials, which have a higher average rating than OVAs despite OVAs being more common.
x = data.groupby('type')
x.size().plot(kind = 'bar', xlabel = 'Type', ylabel = 'Frequency', title = 'Anime Type Count')
plt.figure(figsize=(5,5))
plt.bar(x.groups.keys(), x['rating'].mean()) # Select the rating column before averaging so non-numeric columns are excluded
plt.xlabel('Type')
plt.ylabel('Average Rating')
plt.title('Average Rating Across Anime Types')
plt.show()
Anime Genres¶
For anime genres, I created a dictionary to count how many works each genre has as well as the average rating of each genre. The results show that some genres are more popular than others, but the ratings do not differ all that drastically. The average rating for Dementia is much lower than the others, but it is an outlier. Genres like Josei and Thriller have some of the highest ratings, but the difference is not too significant.

Next, I wanted to see whether popular genres, i.e. genres that show up the most, correlate with a higher average rating. At first glance, there is no significant correlation between genre count and rating. Comedy has over 4000 works but a relatively low average rating, while Thriller and Josei have some of the fewest works but some of the highest average ratings, which might suggest a negative correlation between rating and count. I therefore plotted average genre rating against count and fit a linear regression to see how correlated they are. The result: I failed to find a correlation between the two variables, and the average rating of a genre likely does not depend on the count of that genre. The very low r value of -0.0398 indicates weak correlation, and the very high p-value of 0.80 indicates that the relationship is not statistically significant.
frequency = dict() #Counts each genre. If an anime has more than one genre, it is double counted
scores = dict() #Stores average rating
for index, row in data.iterrows():
    genres = row['genre'].split(', ')
    for genre in genres:
        frequency[genre] = frequency.get(genre, 0) + 1
        scores[genre] = scores.get(genre, 0) + row['rating']
for key, value in frequency.items():
    scores[key] = scores.get(key, 0) / frequency.get(key, 1)
plt.figure(figsize=(40,5))
plt.bar(range(len(scores)), list(scores.values()), tick_label = list(scores.keys()))
plt.xlabel('Anime Genre')
plt.ylabel('Average Rating')
plt.title('Average Rating Across Anime Genres')
plt.show()
plt.figure(figsize=(40,5))
plt.xlabel('Anime Genre')
plt.ylabel('Frequency')
plt.title('Number of Anime per Anime Genre')
plt.bar(range(len(frequency)), list(frequency.values()), tick_label = list(frequency.keys()))
<BarContainer object of 43 artists>
x = list(frequency.values())
y = list(scores.values())
plt.scatter(x,y)
a, b = np.polyfit(x, y, 1)
plt.xlabel('Number of Anime')
plt.ylabel('Rating')
plt.title('Anime Genre Average Rating vs. Count')
plt.plot(x, (np.array(x)*a)+b)
linregress(x, y)
LinregressResult(slope=-1.7915622030846685e-05, intercept=6.749665182852247, rvalue=-0.039817495681292954, pvalue=0.7998761622286875, stderr=7.021363800846438e-05, intercept_stderr=0.08647336310841422)
The Content Based Filtering Recommendation¶
Content-based filtering uses the features of an item to recommend similar items to users. If the system recognizes that you liked Iron Man, for example, it will see that Iron Man 2 is similar to Iron Man and recommend you Iron Man 2. The benefits of a content-based filtering system are that it does not require data about other users to make a recommendation, so it can recommend niche items to users even if others are not interested. The recommendations can be highly relevant, and the process is very transparent. The system does not need to collect user information, which protects user privacy if that is a concern, and it is a good starting point for companies that do not yet have much user data to work with. On the other hand, content-based filtering can lack diversity, and new products can be difficult to surface since each one must be assigned attributes first.

To create such a system, I first need to define the features that will represent each anime. It is intuitive to start with genres, episodes, rating, members, and type. If a user likes TV series, it is natural to suggest another TV series, hence the type feature. If an anime has 12 episodes, it is better to suggest another anime of similar length rather than one with 60+ episodes, hence the episodes feature. If the user loves watching romance anime, it is reasonable to suggest other romance anime, hence the genre feature. Adding rating and members as features can be experimented with: if they are not added, anime will be suggested purely based on length and genre, but if they are added, higher-rated and more popular anime may sometimes overrule the genre similarity of two anime. That is not necessarily bad, since the majority of users are likely to enjoy popular, highly rated anime even when the genre does not match exactly, so these two are included as features here.
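The intuition behind feature-based similarity can be sketched with toy vectors (all feature values below are invented purely for illustration; the actual pipeline later uses a k-d tree on the real features):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy feature matrix: rows = anime, columns = [Romance, Action, scaled episodes, scaled rating]
features = np.array([
    [1, 0, 0.1, 0.9],   # "Anime A": short romance, high rating
    [1, 0, 0.1, 0.8],   # "Anime B": very similar short romance
    [0, 1, 0.9, 0.7],   # "Anime C": long action TV series
])

sim = cosine_similarity(features)
# A should be far more similar to B than to C
print(sim[0, 1] > sim[0, 2])  # -> True
```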
The following is a simple attempt at content-based filtering.
I first used one-hot encoding to replace the genre column; these new columns serve as features. When performing the fitting, I realized that some episode counts were tagged as 'Unknown'. For simplicity, I simply deleted these entries and converted the whole dataframe's values into floats. Alternatively, I could have looked up the episode counts of those anime and entered them manually, or, if there were too many unknown entries, filled them with a default value while manually filling in only the anime with high ratings and many members. I used MinMaxScaler to create my features, which scales each column to the range 0-1 (this could alternatively be done manually). I then fed the features into a k-d tree; each time I query it with an anime, it returns the 10 anime most similar to it. As an example, I asked the tree to return 10 recommendations for Kimi no Na wa, and the output is as follows:
genre_dummies = data['genre'].str.get_dummies(sep=', ') #One hot encode; split on ', ' so genre names do not keep a leading space
type_dummies = pd.get_dummies(data['type'])
data10 = pd.concat([data, genre_dummies], axis = 1)
data10 = pd.concat([data10, type_dummies], axis = 1)
data10 = data10.drop(columns = ['genre','type','anime_id'])
data10 = data10[data10['episodes'] != 'Unknown']
data10
name | episodes | rating | members | Adventure | Cars | Comedy | Dementia | Demons | Drama | ... | Supernatural | Thriller | Vampire | Yaoi | Movie | Music | ONA | OVA | Special | TV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Kimi no Na wa. | 1 | 9.37 | 200630 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | Fullmetal Alchemist: Brotherhood | 64 | 9.26 | 793665 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | Gintama° | 51 | 9.25 | 114262 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | Steins;Gate | 24 | 9.17 | 673572 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | Gintama' | 51 | 9.16 | 151266 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
12289 | Toushindai My Lover: Minami tai Mecha-Minami | 1 | 4.15 | 211 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
12290 | Under World | 1 | 4.28 | 183 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
12291 | Violence Gekiga David no Hoshi | 4 | 4.88 | 219 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
12292 | Violence Gekiga Shin David no Hoshi: Inma Dens... | 1 | 4.98 | 175 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
12293 | Yasuji no Pornorama: Yacchimae!! | 1 | 5.46 | 142 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
11830 rows × 92 columns
copy = data10.drop(columns = ['name']).astype(float)
scaler = MinMaxScaler() #Scales each column from 0-1
features = scaler.fit_transform(copy)
kdt = KDTree(features) #Nearest Neighbor algorithm for anime recommendations
index = kdt.query([features[0]], k=10, return_distance=False) #Query with the scaled feature row to return 10 anime recommendations for Kimi no Na wa.
for ind in index:
    print(data10.iloc[ind]['name']) #prints names of the recommended anime
40                       Death Note
86               Shingeki no Kyojin
804                Sword Art Online
1      Fullmetal Alchemist: Brotherhood
159                    Angel Beats!
19     Code Geass: Hangyaku no Lelouch
841                          Naruto
3                       Steins;Gate
445                Mirai Nikki (TV)
131                       Toradora!
Name: name, dtype: object
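The query above could be wrapped into a reusable helper. The sketch below rebuilds the same pipeline on a tiny made-up table (names and feature values are invented) so it runs standalone:

```python
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree

# Toy stand-in for data10: a name column plus numeric feature columns (invented values)
data10 = pd.DataFrame({
    'name':     ['A', 'B', 'C', 'D'],
    'episodes': [1, 1, 64, 51],
    'rating':   [9.4, 9.2, 9.3, 7.0],
    'members':  [200630, 190000, 793665, 1000],
})

features = MinMaxScaler().fit_transform(data10.drop(columns=['name']))
kdt = KDTree(features)

def recommend(anime_name, k=3):
    """Return the names of the k nearest neighbors of anime_name (including itself)."""
    i = data10.index[data10['name'] == anime_name][0]
    idx = kdt.query([features[i]], k=k, return_distance=False)[0]
    return data10.iloc[idx]['name'].tolist()

print(recommend('A', k=2))  # -> ['A', 'B']
```

Calling `recommend` with the real `data10` and `features` built above would return the same kind of list, with the queried anime itself as the nearest neighbor.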
Preprocessing User Rating Table¶
Now onto preprocessing and analyzing rating.csv. The end result will be a collaborative filtering recommender system. This dataset is not as complex as the last one, and only contains each user's rating of a particular anime. If we graph the distribution of ratings, we see that the majority of users gave their anime either a high score of 9 or 10, or a -1. This makes sense: the people most willing to rate are those who are extremely passionate about an anime, while many others cannot be bothered to rate at all. What is surprising, though, is the low number of ratings at 4 or below. Since passionate people rate anime, one would expect a good number of 1s as well; I expected a bimodal distribution with one peak at 8-10 and another at -1 through 2, but 1s and 2s occur the least frequently here. Note also that many people watch anime, yet this dataset only contains ratings from roughly 73,500 users, so the users included may be biased rather than random. It is still fine to use their data in a recommendation system, which is what we do next.
data2 = data2.dropna()
data2 = data2.drop_duplicates(subset=['anime_id','user_id'], keep= 'last')
plt.hist(data2['rating'], edgecolor = 'black')
plt.title('Histogram of Anime Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
The Collaborative Based Filtering Approach¶
Collaborative filtering is the opposite of content-based filtering. While content-based filtering uses data about items to find similar items, collaborative filtering uses data about users to find similar users. Once a cluster of similar users is found, recommendations are made based on what those users like. If you buy a nail, for example, the system will look at other users who bought nails, notice that they often bought a hammer with their nail, and recommend you a hammer. A content-based system would not necessarily see the connection between nail and hammer, and would likely recommend similar nails. The advantages are that it is much simpler than content-based filtering: if the data is there, the implementation is relatively straightforward, and it can find connections that content-based filtering cannot, like the nail-and-hammer association. On the other hand, scalability is an issue. It can be difficult to process a large amount of user interest data, yet without enough data the recommendations can be inaccurate. Collaborative filtering also suffers from the cold-start problem, which often occurs when new products are released. The following is a simple collaborative filtering implementation.
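The nail-and-hammer intuition can be illustrated with a toy user-item rating matrix (all values invented):

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Toy user-item matrix: rows = users, columns = [nail, hammer, saw]; 0 = unrated
ratings = np.array([
    [5, 5, 0],   # user 0 rated nails and hammers highly
    [5, 0, 0],   # user 1 rated nails only
    [0, 0, 5],   # user 2 rated saws only
])

sim = cosine_similarity(ratings)
# User 1's most similar other user is user 0 (they share the nail);
# [-1] in the ascending sort is user 1 itself, so take [-2]
most_similar = int(np.argsort(sim[1])[-2])

# Recommend the similar user's highest-rated item that user 1 has not rated: the hammer
unrated = ratings[1] == 0
rec_item = int(np.argmax(np.where(unrated, ratings[most_similar], -1)))
print(most_similar, rec_item)  # -> 0 1
```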
Since there are too many users, I decided to sample only users who have rated more than 500 anime. This saves memory and improves runtime while preserving a good-sized sample of 1800+ users. First, I need to pivot the table to show each user's rating of each anime, remembering to fill each anime the user did not watch with that user's average rating (an alternative would be to fill with the anime's own average rating instead). After that, I applied cosine_similarity to the table, which compares each user to every other user and assigns a value based on how similar they are. I then clustered similar users together (arbitrarily deciding on 20 clusters), and found anime recommendations based on what people in the same cluster liked, by sorting on the highest mean rating and taking the top 5. The code could be slightly modified to further test this method on users with fewer than 500 anime reviews. As an example, I asked it to find recommendations for user 54, and the output is shown below:
data2 = data2.groupby('user_id').filter(lambda x: len(x)>500) # Decrease Dataset for runtime and prevent notebook from crash on memory
data3 = data2.groupby('user_id').mean().drop(columns = 'anime_id')
data4 = pd.pivot_table(data2.reset_index(),
                       index='user_id', columns='anime_id', values='rating') #Pivot table for user rating of each anime
for index, row in data3.iterrows():
    data4.loc[index] = data4.loc[index].fillna(row['rating']) #Fill in unwatched shows with the user's average rating
data4
anime_id | 1 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 | 19 | ... | 34252 | 34283 | 34324 | 34325 | 34349 | 34358 | 34367 | 34475 | 34476 | 34519 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
17 | 4.351082 | 4.351082 | 7.000000 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 10.000000 | ... | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 | 4.351082 |
54 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 |
201 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | ... | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 | 0.856031 |
226 | 8.000000 | 7.680593 | 8.000000 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | ... | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 | 7.680593 |
271 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | ... | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 | 7.372287 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
73378 | 9.000000 | 9.000000 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 8.000000 | 7.935083 | ... | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 | 7.935083 |
73395 | 10.000000 | 9.000000 | 10.000000 | 8.266667 | 8.266667 | 9.000000 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | ... | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 | 8.266667 |
73408 | 10.000000 | 10.000000 | 9.000000 | 0.742481 | 0.742481 | 0.742481 | -1.000000 | 0.742481 | 0.742481 | 0.742481 | ... | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 | 0.742481 |
73499 | 9.000000 | 7.832504 | 9.000000 | 7.832504 | 7.832504 | 10.000000 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | ... | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 | 7.832504 |
73502 | 8.486275 | 8.486275 | 8.486275 | 9.000000 | 8.486275 | 8.486275 | 10.000000 | 8.486275 | 8.486275 | 8.486275 | ... | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 | 8.486275 |
1843 rows × 11140 columns
cos = cosine_similarity(data4) #Cosine similarity measures how similar two users are
np.fill_diagonal(cos, 0 )
similar_anime =pd.DataFrame(cos,index=data4.index)
similar_anime.columns=data4.index
similar_anime.head() # measures how similar each user is to any other user
user_id | 17 | 54 | 201 | 226 | 271 | 294 | 342 | 392 | 446 | 478 | ... | 73272 | 73286 | 73340 | 73356 | 73362 | 73378 | 73395 | 73408 | 73499 | 73502 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
17 | 0.000000 | -0.978626 | 0.717752 | 0.978291 | 0.978297 | 0.978963 | -0.141789 | 0.971066 | 0.978606 | 0.978851 | ... | 0.976770 | 0.962211 | 0.978055 | 0.978534 | 0.978011 | 0.979068 | 0.978981 | 0.657635 | 0.979240 | 0.978218 |
54 | -0.978626 | 0.000000 | -0.729428 | -0.999320 | -0.998067 | -0.999544 | 0.149240 | -0.986644 | -0.998229 | -0.999552 | ... | -0.997657 | -0.978811 | -0.998040 | -0.999252 | -0.998600 | -0.999432 | -0.999557 | -0.671318 | -0.999348 | -0.998878 |
201 | 0.717752 | -0.729428 | 0.000000 | 0.729388 | 0.728997 | 0.729626 | -0.084838 | 0.722958 | 0.728312 | 0.729244 | ... | 0.725408 | 0.719273 | 0.729466 | 0.728688 | 0.729336 | 0.729428 | 0.729890 | 0.533101 | 0.729849 | 0.729096 |
226 | 0.978291 | -0.999320 | 0.729388 | 0.000000 | 0.997508 | 0.998871 | -0.148120 | 0.986036 | 0.997613 | 0.998805 | ... | 0.997040 | 0.978624 | 0.997417 | 0.998559 | 0.997872 | 0.998692 | 0.998888 | 0.670239 | 0.998675 | 0.998172 |
271 | 0.978297 | -0.998067 | 0.728997 | 0.997508 | 0.000000 | 0.997637 | -0.146689 | 0.985432 | 0.996541 | 0.997832 | ... | 0.995750 | 0.978020 | 0.996259 | 0.997381 | 0.996754 | 0.997731 | 0.997797 | 0.669478 | 0.997595 | 0.997054 |
5 rows × 1843 columns
kmeans = KMeans(n_clusters=20).fit(similar_anime) #Use K means to group similar users together. Other approaches available.
labels = kmeans.labels_
data4['label'] = labels
target = data4.iloc[1]['label'] #As an example, we are seeing what anime to recommend for user_id 54
data4 = data4.groupby('label')
movies = []
x = data4.get_group(target).drop(columns = 'label').transpose()
x['mean'] = x.mean(axis=1)
x = x.sort_values(by=['mean'], ascending = False) #Sort anime by mean rating within the cluster
count = 0
#Pick the anime with the top 5 average scores. Anime the current user already watched are not excluded, but that is easily implementable.
for index, row in x.iterrows():
    if (count < 5):
        movies.append(index)
        count = count + 1
    else:
        break
x
user_id | 54 | 917 | 940 | 1579 | 1870 | 2243 | 2264 | 2864 | 3325 | 3391 | ... | 61622 | 62209 | 64174 | 65468 | 68017 | 68721 | 68787 | 68795 | 69121 | mean |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
anime_id | |||||||||||||||||||||
1690 | -1.0 | -0.960396 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -0.989051 | -1.0 | -1.0 | -1.0 | -1.0 | -0.983636 | -1.0 | -1.0 | -1.0 | -0.899399 |
908 | -1.0 | -0.960396 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -0.989051 | -1.0 | -1.0 | -1.0 | -1.0 | -0.983636 | -1.0 | -1.0 | -1.0 | -0.899712 |
2167 | -1.0 | -0.960396 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -0.989051 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.900175 |
8861 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -0.989051 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.900275 |
19363 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.900356 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.999735 |
226 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.999736 |
2001 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -0.997605 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -0.983636 | -1.0 | -1.0 | -1.0 | -0.999781 |
2993 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -0.999807 |
356 | -1.0 | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.0 | -1.000000 | ... | -1.000000 | -1.0 | -1.0 | -1.0 | -1.0 | -0.983636 | -1.0 | -1.0 | -1.0 | -0.999854 |
11140 rows × 113 columns
#Print Movies that user 54 would like
for movie in movies:
    print(data[data['anime_id'] == movie]['name'])
807    Bokurano
Name: name, dtype: object
1742    Fullmetal Alchemist: Premium Collection
Name: name, dtype: object
223    Clannad
Name: name, dtype: object
4480    Yosuga no Sora: In Solitude, Where We Are Leas...
Name: name, dtype: object
179    Gin no Saji 2nd Season
Name: name, dtype: object
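As the comment above notes, titles the target user already watched were not excluded. A sketch of that filter on a made-up cluster table (the ids and ratings are invented; note that in this dataset -1 can also mean watched-but-unrated, so this filter is approximate):

```python
import pandas as pd

# Toy version of the cluster table x: rows = anime_id, columns = users in the cluster, plus 'mean'
x = pd.DataFrame({
    54:     [-1.0,  9.0, -1.0,  8.0],
    917:    [10.0,  8.0,  7.0, -1.0],
    'mean': [ 4.5,  8.5,  3.0,  3.5],
}, index=pd.Index([1690, 908, 2167, 8861], name='anime_id'))

target_user = 54
# Keep only anime the target user has no rating for (encoded as -1 here)
unseen = x[x[target_user] == -1.0]
top = unseen.sort_values('mean', ascending=False).head(2).index.tolist()
print(top)  # -> [1690, 2167]
```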
Conclusion¶
Before conducting content-based and collaborative filtering recommendation, I had to do some preliminary cleaning and exploratory analysis on each dataset. For the anime table, we delved into the relations between rating, type, and genre. It seemed at first that there was a correlation between rating and genre, but that proved inconclusive. Later, for content-based filtering, I chose genre, type, episode count, rating, and members as features, and used a k-d tree to find the 10 nearest neighbors of an anime as recommendations. I was able to include the entire dataset and did not need individual user rating information. When performing collaborative filtering, however, there were too many users and too many anime they had seen, which forced me to evaluate only a portion of the data in consideration of memory and runtime. By pivoting the table to see what rating each user gave each anime, I was able to apply cosine_similarity and compare each user to the other users in the same cluster to produce recommendations. There are benefits and disadvantages to both approaches; a hybrid approach could combine the strengths of each, but was not used here. Anime is a beautiful medium, and exploring these two recommendation systems can help share it with even more people.