bayesian averaging in a dataframe

  • Last Update :
  • Techknowledgy :

I began with the dataframe you gave as an example:

import numpy as np
import pandas as pd

# Example ratings on a 0-to-1 scale, one column per user; np.nan marks a
# missing rating.
d = {
    'Bar': ['Snickers', 'Mars Bars', 'Milky Way', 'Almond Joy', 'Babe Ruth'],
    'User1': [0.01, 0.25, 0.9, np.nan, 0.5],
    'User2': [np.nan, 0.4, 1.0, np.nan, 0.1],
    'User3': [0.7, 0.1, np.nan, np.nan, 0.3],
}

df = pd.DataFrame(data=d)

Which looks like this:

    Bar User1 User2 User3
    0 Snickers 0.01 NaN 0.7
    1 Mars Bars 0.25 0.4 0.1
    2 Milky Way 0.90 1.0 NaN
    3 Almond Joy NaN NaN NaN
    4 Babe Ruth 0.50 0.1 0.3

The first thing I did was create a list of all columns that had user reviews:

# Names of every column that holds user reviews, i.e. whose label contains
# the substring 'User'.
user_cols = [col for col in df.columns if 'User' in col]

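The score for each row is then computed as S = w*R + (1 - w)*C, where v is the number of ratings in the row, w = v / (v + m) with m the mean of v over all rows, R is the row's mean rating, and C is the mean of all ratings in the table. The final column S contains all the S-scores for the candy bars. If you want, you could then delete the v, w, and R temporary columns with df = df.drop(['v', 'w', 'R'], axis=1):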

    Bar User1 User2 User3 S
    0 Snickers 0.01 NaN 0.7 0.3905
    1 Mars Bars 0.25 0.4 0.1 0.3204
    2 Milky Way 0.90 1.0 NaN 0.6880
    3 Almond Joy NaN NaN NaN NaN
    4 Babe Ruth 0.50 0.1 0.3 0.3504

Suggestion : 2

I'm attempting to extract a series of Bayesian averages, based on a dataframe (by row). For example, say I have a series of (0 to 1) user ratings of candy bars, stored in a dataframe like the one shown below. However, my calculation seems to be off, in such a way that as the number of User columns in my initial dataframe grows, the final calculated Bayesian average grows as well (into numbers greater than 1). Next, I found it most straightforward to create each variable of the Bayesian Average equation either as a column in the dataframe, or as a standalone variable:


                        User1  User2  User3
            Snickers    0.01   NaN    0.7
            Mars Bars   0.25   0.4    0.1
            Milky Way   0.9    1.0    NaN
            Almond Joy  NaN    NaN    NaN
            Babe Ruth   0.5    0.1    0.3

import numpy as np
import pandas as pd

# Candy-bar ratings on a 0-to-1 scale, one column per user; np.nan marks a
# missing rating.
d = {
    'Bar': ['Snickers', 'Mars Bars', 'Milky Way', 'Almond Joy', 'Babe Ruth'],
    'User1': [0.01, 0.25, 0.9, np.nan, 0.5],
    'User2': [np.nan, 0.4, 1.0, np.nan, 0.1],
    'User3': [0.7, 0.1, np.nan, np.nan, 0.3],
}
df = pd.DataFrame(data=d)

Suggestion : 3

September 11, 2014

import pandas as pd

# Read the first three tab-separated columns of u.data and label them.
udata = pd.read_csv('u.data', names=('user_id', 'movie_id', 'rating'),
                    sep='\t', usecols=[0, 1, 2])
# Read the first two pipe-separated columns of u.item and label them.
uitem = pd.read_csv('u.item', names=('movie_id', 'title'),
                    sep='|', usecols=[0, 1])
# Attach a title to every rating by joining on movie_id, then save the
# result without the index column.
ratings = pd.merge(udata, uitem, on='movie_id')
ratings.to_csv('ratings.csv', index=False)
import os
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Default CSV location: a file named ratings.csv in this script's directory.
RATINGS = os.path.join(os.path.dirname(__file__), 'ratings.csv')


class Ratings(object):
    """Holds a ratings CSV loaded into a pandas DataFrame."""

    def __init__(self, path=RATINGS):
        """Store *path* and load the CSV it points to straight away."""
        self.path = path
        self.load()

    def load(self):
        """Read the CSV at ``self.path`` into ``self.data``."""
        self.data = pd.read_csv(self.path)

    def __str__(self):
        """Return the text form of the first rows (``DataFrame.head()``)."""
        return str(self.data.head())


if __name__ == "__main__":
    ratings = Ratings()
    # Call form of print works on Python 3 (and on Python 2 for one argument).
    print(ratings)
class Ratings(object):

    ...  # constructor, load() and __str__ elided in this excerpt

    @property
    def movies(self):
        """
        Returns the data grouped by Movie
        """
        return self.data.groupby('title')

    def get_means(self):
        """Return the mean of the 'rating' column for each title."""
        return self.movies['rating'].mean()

    def get_counts(self):
        """Return the number of non-null ratings for each title."""
        return self.movies['rating'].count()

    def top_movies(self, n=10):
        """
        Return the n titles with the highest mean rating.

        The result is a DataFrame indexed by title with 'mean' and 'count'
        columns, ordered by ascending mean so the best title is the last row.
        """
        grid = pd.DataFrame({
            'mean': self.get_means(),
            'count': self.get_counts(),
        })
        # DataFrame.ix no longer exists in pandas; sort and take the tail.
        # tail(0) is empty, unlike the slice [-0:], which selects every row.
        # Titles with a NaN mean sort first so they never rank as "top".
        return grid.sort_values('mean', na_position='first').tail(n)


if __name__ == "__main__":
    ratings = Ratings()
    print(ratings.top_movies())
                                                   count mean
                                                   title
                                                   Aiqing wansui(1994) 1 5
                                                   They Made Me a Criminal(1939) 1 5
                                                   Great Day in Harlem, A(1994) 1 5
                                                   Saint of Fort Washington, The(1993) 2 5
                                                   Entertaining Angels: The Dorothy Day Story(1996) 1 5
                                                   Someone Else 's America (1995)                          1     5
                                                   Star Kid(1997) 3 5
                                                   Santa with Muscles(1996) 2 5
                                                   Prefontaine(1997) 3 5
                                                   Marlene Dietrich: Shadow and Light(1996) 1 5
class Ratings(object):

    ...  # other members elided in this excerpt

    def plot_mean_frequency(self):
        """
        Show a hexbin plot of mean rating against number of reviewers.

        One point per title: x is the count of ratings (log scale) and y is
        the mean rating. Opens the figure with ``plt.show()``.
        """
        grid = pd.DataFrame({
            'Mean Rating': self.movies['rating'].mean(),
            'Number of Reviewers': self.movies['rating'].count(),
        })

        grid.plot(x='Number of Reviewers', y='Mean Rating', kind='hexbin',
                  xscale='log', cmap='YlGnBu', gridsize=12, mincnt=1,
                  title="Star Ratings by Simple Mean")
        plt.show()
class Ratings(object):

    def __init__(self, path=None, m=None, C=None):
        """
        Store the CSV path and the Bayesian parameters, then load the data.

        path: CSV file to load; when omitted, the module-level RATINGS path
              is used (the name PATH is not defined anywhere in this file).
        m:    prior, the value added C times to every group's ratings.
        C:    confidence, the weight given to the prior.
        """
        self.path = RATINGS if path is None else path
        self.prior = m
        self.confidence = C
        self.load()

    def bayesian_mean(self, arr):
        """
        Return (C * m + sum(arr)) / (C + count(arr)) for a pandas Series.

        Raises TypeError when m or C was not supplied. The check is against
        None, so a prior or confidence of 0 is accepted.
        """
        if self.prior is None or self.confidence is None:
            raise TypeError("Bayesian mean must be computed with m and C")

        return ((self.confidence * self.prior + arr.sum()) /
                (self.confidence + arr.count()))

    ...  # load(), movies, get_means() and get_counts() elided in this excerpt

    def get_bayesian_estimates(self):
        """Return the Bayesian mean of the 'rating' column for each title."""
        return self.movies['rating'].agg(self.bayesian_mean)

    def top_movies(self, n=10):
        """
        Return the n titles with the highest Bayesian estimate.

        The result has 'mean', 'count' and 'bayes' columns and is ordered by
        ascending 'bayes', so the best title is the last row.
        """
        grid = pd.DataFrame({
            'mean': self.get_means(),
            'count': self.get_counts(),
            'bayes': self.get_bayesian_estimates(),
        })
        # DataFrame.ix no longer exists in pandas; sort and take the tail.
        # tail(0) is empty, unlike the slice [-0:], which selects every row.
        return grid.sort_values('bayes', na_position='first').tail(n)