Basic Data Manipulation in Python (with IPython)

Data and scripts came from OReilly.Python.For.Data.Analysis.Oct.2012.ISBN.1449319793

With a lot of adjustments for Python3 since the book used Python 2.7

import pandas as pd
#######movie data
unames=["user_id","gender","age","occupation","zip"]
users=pd.read_table("ml/users.dat",sep="::",names=unames, header=None, engine="python")

rnames=["user_id","movie_id","rating","timestamp"]
ratings=pd.read_table("ml/ratings.dat",sep="::",names=rnames, header=None,engine="python")

mnames=["movie_id","title","genres"]
movies=pd.read_table("ml/movies.dat",sep="::",names=mnames, header=None,engine="python")
# in shell or IPython, add this first: os.chdir("/Users/Charles/Desktop/pyp/")

users[:5]

mean_ratings=data_movie.pivot_table("rating",index="title",columns="gender",aggfunc="mean")
mean_ratings.ix[0]
ratings_by_title=data_movie.groupby("title").size()
ratings_by_title
top_f_ratings=mean_ratings.sort_values(by=“F”,ascending=False)
mean_ratings["diff"]=mean_ratings["M"]-mean_ratings["F"]
sort_by_diff=mean_ratings.sort_values(by="diff")

######US babies data
names1880=pd.read_csv("baby_names/yob1880.txt", names=["name","seex","births"])
years=range(1880,2015)
pieces=[]

columns=["names","sex","births"]

for year in years:
path="baby_names/yob%d.txt"%year
frame=pd.read_csv(path,names=columns)
frame["year"]=year
pieces.append(frame)
names=pd.concat(pieces,ignore_index=True)

total_births=names.pivot_table(values=“births”,index="year",columns="sex",aggfunc=sum)
total_births.tail()

%matplotlib
total_births.plot(title="Total births by sex and year")

Total Birhts.png


def add_prob(group):
births=group.births.astype(float)
group["prop"]=births/births.sum()
return group

names=names.groupby(["year","sex"]).apply(add_prob)
pieces=[]
for year,group in names.groupby(["year","sex"]):
pieces.append(group.sort_values(by="births",ascending=False)[:1000])

top1000=pd.concat(pieces,ignore_index=True)
boys=top1000[top1000.sex=="M"]
girls=top1000[top1000.sex=="F"]

total_births=top1000.pivot_table(values="births",index="year",columns="names",aggfunc=sum)

subset=total_births[["John","Harry","Mary","Marilyn"]]
subset.plot(subplots=True,figsize=(12,10),grid=False,title="Number of birhts per year")

Number of birhts per year.png


table=top1000.pivot_table(values="prop",index="year",columns="sex",aggfunc=sum)

import numpy as np
table.plot(title="Sum by year and sex",yticks=np.linspace(0,1.2,13),
xticks=range(1880,2020,10))

Sum by Year and Sex.png

df=boys[boys.year==2010]
prop_cumsum=df.sort_values(by="prop").prop.cumsum()
prop_cumsum[:10]
prop_cumsum.searchsorted(0.5)

def get_quantile_count(group,q=0.5):
group=group.sort_values(by="prop",ascending=False)
return group.prop.cumsum().searchsorted(q)+1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')

diversity['M']=[int(x)for x in diversity['M']]
diversity['F']=[int(x)for x in diversity['F']]
diversity.head()
diversity.plot(title="Number of popular names in top 50%")

Number of popular names in top 50%.png

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s