需要使用到的包
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
常用统计量的计算
# Basic descriptive statistics of the `label` column.
df.label.mean()    # arithmetic mean
df.label.median()  # median
df.label.mode()    # mode (most frequent value); returns a Series because ties are possible
df.label.min()     # minimum
数据计数
# Count how often each distinct value occurs in a column.
df["label"].value_counts()
# e.g. number of rows in ratings_df for each movieId
ratings_df["movieId"].value_counts()
groupby()
# Group the rows by movieId, then average the rating column within each group.
mean_rating_per_movie = ratings_df.groupby("movieId")["rating"].mean()
print(mean_rating_per_movie)
.agg()
一次性计算并显示多个指定的统计量
# Compute several summary statistics of the rating column in a single call;
# the result has one entry per requested statistic.
ratings_df["rating"].agg(["min", "max", "mean", "median", "count"])
绘图
线性图
.plot()
用法
# General pattern: draw y against x, then display the figure.
plt.plot(x, y)
plt.show()

# Example: column X plotted against the timestamp column.
plt.figure(figsize=(10, 4))  # figure size in inches (width, height)
plt.plot(df["timestamp"], df["X"], 'k-')  # 'k-' = black solid line
plt.title('Activity recorded using watch accelerometer')
plt.xlabel('Time (milliseconds)')
plt.show()
.axes()
用于设置坐标轴(子图)在画布中的具体位置和大小
# Main plot with a smaller inset plot overlaid on it.
plt.figure(figsize=(10, 6))

# plt.axes([left, bottom, width, height]) places an axes using fractions of
# the figure (0 to 1). left + width and bottom + height must stay <= 1,
# otherwise the axes (and its title) is drawn outside the figure and clipped.
plt.axes([0.1, 0.1, 0.85, 0.8])
plt.plot(df.timestamp, df.X)
plt.xlabel('Time(ms)')
plt.ylim([-5, 20])
plt.title('X-axis')

# Inset: the Y series drawn in a small axes inside the lower-right part of
# the main axes.
plt.axes([0.6, 0.2, 0.3, 0.3])
plt.plot(df.timestamp, df.Y, color='r')
plt.xlabel('Time(ms)')
plt.title('Y-axis')
plt.show()
.subplot()
# One row of three panels showing the first 100 samples of the X, Y and Z
# columns against the timestamp column.
plt.figure(figsize=(10, 5))
# plt.subplot(nrows, ncols, index): index counts row-wise from the top left
# and starts at 1, hence enumerate(..., start=1).
for panel, axis_name in enumerate(("X", "Y", "Z"), start=1):
    plt.subplot(1, 3, panel)
    plt.plot(df.timestamp[0:100], df[axis_name][0:100])
    plt.xlabel('Time(ms)')
    plt.title(f'{axis_name}-axis')
plt.show()
bar
# Bar chart of genres_dist, one bar per index entry.
plt.figure(figsize=(10, 4))
genres_dist.plot.bar(title="Genres distribution of rated movies of all time")
plt.show()
Pie charts
# Column-wise totals over the genre columns ("(no genres listed)" through
# "Western") for the rows selected by is_20s.
genres_20s = movies_prep.loc[is_20s, "(no genres listed)":"Western"].sum(axis=0)
# Drop genres whose total is zero so the pie has no empty "0.0%" wedges.
genres_20s = genres_20s[genres_20s > 0]
plt.figure(figsize=(8, 8))
genres_20s.plot.pie(autopct='%1.1f%%', pctdistance=0.85)
plt.ylabel('')  # clear the y-axis label
# Alternative: pass labels=None to pie() above and uncomment the line below to
# show the genre names in a legend instead of around the pie.
# plt.legend(genres_20s.index, loc='right', bbox_to_anchor=(1.3, 0.5))
plt.show()
histogram
# movieIds of the rows in movies_prep flagged as Fantasy / Musical.
fantasy_movie_ids = movies_prep.loc[movies_prep["Fantasy"] == 1, "movieId"]
musical_movie_ids = movies_prep.loc[movies_prep["Musical"] == 1, "movieId"]

# Select the ratings of those movies and draw both histograms on the same
# axes; alpha=0.5 keeps the overlapping bars visible.
ratings_df.loc[ratings_df["movieId"].isin(fantasy_movie_ids), "rating"].hist(
    alpha=0.5, color='b', label='Fantasy'
)
ratings_df.loc[ratings_df["movieId"].isin(musical_movie_ids), "rating"].hist(
    alpha=0.5, color='r', label='Musical'
)

# Title, axis label and legend (legend entries come from the label= arguments).
plt.title('Distribution of rated movies with Fantasy and Musical genres')
plt.xlabel('Rating')
plt.legend()
plt.show()
Comments | NOTHING