I began with the dataframe you gave as an example:
# Example ratings table: one row per candy bar, one column per user.
# np.nan marks a bar that the user has not rated.
d = dict(
    Bar=['Snickers', 'Mars Bars', 'Milky Way', 'Almond Joy', 'Babe Ruth'],
    User1=[0.01, 0.25, 0.9, np.nan, 0.5],
    User2=[np.nan, 0.4, 1.0, np.nan, 0.1],
    User3=[0.7, 0.1, np.nan, np.nan, 0.3],
)
df = pd.DataFrame(d)
Which looks like this:
          Bar  User1  User2  User3
0    Snickers   0.01    NaN    0.7
1   Mars Bars   0.25    0.4    0.1
2   Milky Way   0.90    1.0    NaN
3  Almond Joy    NaN    NaN    NaN
4   Babe Ruth   0.50    0.1    0.3
The first thing I did was create a list of all columns that had user reviews:
# Names of every column that holds user ratings, i.e. every column whose
# label contains the substring 'User' (this skips the 'Bar' label column).
user_cols = [col for col in df.columns.values if 'User' in col]
Where the final column `S` contains all the S-scores for the candy bars. If you want, you could then delete the temporary `v`, `w`, and `R` columns with `df = df.drop(['v', 'w', 'R'], axis=1)`:
          Bar  User1  User2  User3       S
0    Snickers   0.01    NaN    0.7  0.3905
1   Mars Bars   0.25    0.4    0.1  0.3204
2   Milky Way   0.90    1.0    NaN  0.6880
3  Almond Joy    NaN    NaN    NaN     NaN
4   Babe Ruth   0.50    0.1    0.3  0.3504
I'm attempting to extract a series of Bayesian averages, based on a dataframe (by row).
For example, say I have a series of (0 to 1) user ratings of candy bars, stored in a dataframe like so:
However, my calculation seems to be off, in such a way that as the number of User columns in my initial dataframe grows, the final calculated Bayesian average grows as well (into numbers greater than 1).
Next, I found it most straightforward to create each variable of the Bayesian average equation either as a column in the dataframe or as a standalone variable:
            User1  User2  User3
Snickers     0.01    NaN    0.7
Mars Bars    0.25    0.4    0.1
Milky Way    0.9     1.0    NaN
Almond Joy   NaN     NaN    NaN
Babe Ruth    0.5     0.1    0.3
# Ratings keyed by column label: 'Bar' holds the candy bar names and each
# 'UserN' column holds that user's rating, with np.nan where none was given.
d = dict(
    Bar=['Snickers', 'Mars Bars', 'Milky Way', 'Almond Joy', 'Babe Ruth'],
    User1=[0.01, 0.25, 0.9, np.nan, 0.5],
    User2=[np.nan, 0.4, 1.0, np.nan, 0.1],
    User3=[0.7, 0.1, np.nan, np.nan, 0.3],
)
df = pd.DataFrame(d)
September 11, 2014
>>>
# One-off preparation step (originally typed at the interactive prompt):
# join the tab-separated ratings file with the pipe-separated item file on
# 'movie_id' and save the result as ratings.csv.
import pandas as pd

# Only the first three columns of u.data are read.
udata = pd.read_csv('u.data', names=('user_id', 'movie_id', 'rating'),
                    sep='\t', usecols=[0, 1, 2])
# Only the first two columns of u.item are read.
uitem = pd.read_csv('u.item', names=('movie_id', 'title'),
                    sep='|', usecols=[0, 1])
ratings = pd.merge(udata, uitem, on='movie_id')
ratings.to_csv('ratings.csv', index=False)
import os
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Default location of the ratings file: 'ratings.csv' in this module's directory.
RATINGS = os.path.join(os.path.dirname(__file__), 'ratings.csv')


class Ratings(object):
    """Wrapper around a ratings CSV file loaded into a pandas DataFrame."""

    def __init__(self, path=RATINGS):
        """Remember *path* and load the CSV stored there right away."""
        self.path = path
        self.load()

    def load(self):
        """Read the CSV at ``self.path`` into ``self.data``."""
        self.data = pd.read_csv(self.path)

    def __str__(self):
        """Return the text rendering of the first rows of the data."""
        return str(self.data.head())


if __name__ == "__main__":
    ratings = Ratings()
    # Function-call form works on both Python 2 and Python 3.
    print(ratings)
class Ratings(object):
    ...  # members from the previous listing are elided here

    @property
    def movies(self):
        """Return the data grouped by movie title."""
        return self.data.groupby('title')

    def get_means(self):
        """Return the mean of the 'rating' column for each movie."""
        return self.movies['rating'].mean()

    def get_counts(self):
        """Return the number of non-missing 'rating' values for each movie."""
        return self.movies['rating'].count()

    def top_movies(self, n=10):
        """Return the *n* movies with the highest mean rating.

        The result has one row per movie with 'mean' and 'count' columns,
        ordered ascending by mean, so the best-rated movie is the last row.
        """
        grid = pd.DataFrame({
            'mean': self.get_means(),
            'count': self.get_counts(),
        })
        # argsort() yields row *positions*, so select with the positional
        # indexer; the old ``.ix`` indexer no longer exists in pandas.
        order = grid['mean'].argsort().values
        return grid.iloc[order[-n:]]


if __name__ == "__main__":
    ratings = Ratings()
    # Function-call form works on both Python 2 and Python 3.
    print(ratings.top_movies())
                                                   count  mean
title
Aiqing wansui (1994)                                   1     5
They Made Me a Criminal (1939)                         1     5
Great Day in Harlem, A (1994)                          1     5
Saint of Fort Washington, The (1993)                   2     5
Entertaining Angels: The Dorothy Day Story (1996)      1     5
Someone Else's America (1995)                          1     5
Star Kid (1997)                                        3     5
Santa with Muscles (1996)                              2     5
Prefontaine (1997)                                     3     5
Marlene Dietrich: Shadow and Light (1996)              1     5
class Ratings(object):
    ...  # members from the previous listings are elided here

    def plot_mean_frequency(self):
        """Show a hexbin plot of mean rating against number of reviewers.

        Each movie contributes one point: x is its count of ratings
        (drawn on a log scale) and y is its mean rating.
        """
        grid = pd.DataFrame({
            'Mean Rating': self.movies['rating'].mean(),
            'Number of Reviewers': self.movies['rating'].count(),
        })
        grid.plot(x='Number of Reviewers', y='Mean Rating', kind='hexbin',
                  xscale='log', cmap='YlGnBu', gridsize=12, mincnt=1,
                  title="Star Ratings by Simple Mean")
        plt.show()
class Ratings(object):
    """Movie ratings with a Bayesian estimate of each movie's mean rating.

    ``m`` is the prior (the rating assumed before any votes are seen) and
    ``C`` is the confidence (how many votes that prior is worth).
    """

    def __init__(self, path=None, m=None, C=None):
        """Store the file path, prior ``m`` and confidence ``C``, then load.

        When *path* is omitted the module-level ``RATINGS`` location is
        used. The original listing named an undefined ``PATH`` here; the
        default is resolved at call time so the class can be defined first.
        """
        self.path = RATINGS if path is None else path
        self.prior = m
        self.confidence = C
        self.load()

    def bayesian_mean(self, arr):
        """Return ``(C * m + sum(arr)) / (C + count(arr))`` for a Series.

        Raises:
            TypeError: if the prior or the confidence was never supplied.
        """
        # Compare against None explicitly: a prior or confidence of 0 is a
        # legitimate value and must not be treated as "missing".
        if self.prior is None or self.confidence is None:
            raise TypeError("Bayesian mean must be computed with m and C")
        weighted_sum = self.confidence * self.prior + arr.sum()
        total_weight = self.confidence + arr.count()
        # float() forces true division even for all-integer inputs on Python 2.
        return float(weighted_sum) / total_weight

    ...  # members from the previous listings are elided here

    def get_bayesian_estimates(self):
        """Return the Bayesian mean of the 'rating' column for each movie."""
        return self.movies['rating'].agg(self.bayesian_mean)

    def top_movies(self, n=10):
        """Return the *n* movies with the highest Bayesian estimate.

        The result has 'mean', 'count' and 'bayes' columns, ordered
        ascending by 'bayes', so the best movie is the last row.
        """
        grid = pd.DataFrame({
            'mean': self.get_means(),
            'count': self.get_counts(),
            'bayes': self.get_bayesian_estimates(),
        })
        # argsort() yields row *positions*, so select with the positional
        # indexer; the old ``.ix`` indexer no longer exists in pandas.
        order = grid['bayes'].argsort().values
        return grid.iloc[order[-n:]]