Exploratory Data Analysis

发布于 2022-05-28  147 次阅读


需要使用到的包

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

常用统计量的计算

# Arithmetic mean of the `label` column.
df.label.mean()

# Median (middle value) of the column.
df.label.median()

# Mode: the most frequent value(s). The result is a Series rather than a
# scalar, because several values can tie for most frequent.
df.label.mode()

# Smallest value in the column.
df.label.min()

数据计数

# Count how many times each distinct value occurs in the `label` column.
df["label"].value_counts()


# e.g. the number of rows recorded for each movieId
ratings_df["movieId"].value_counts()

groupby()

# Average rating per movie: group the rows by movieId, then take the mean
# of the `rating` values inside each group.
mean_rating_per_movie = (
    ratings_df
    .groupby("movieId")["rating"]
    .mean()
)

print(mean_rating_per_movie)

.agg()

一次性显示所有的统计量

# Several summary statistics of the `rating` column in one call; each name
# in the list is one statistic to compute.
ratings_df["rating"].aggregate(['min', 'max', 'mean', 'median', 'count'])

绘图

折线图

.plot()

用法

# Generic pattern (x and y are placeholders): draw y against x, then render.
plt.plot(x, y)
plt.show()

# Worked example: the X column plotted against the timestamp column.
plt.figure(figsize=(10, 4))  # set the figure size before drawing
plt.plot(df["timestamp"], df["X"], 'k-')  # 'k-' = black solid line
plt.xlabel('Time (milliseconds)')  # x-axis label
plt.title('Activity recorded using watch accelerometer')  # figure title
plt.show()

.axes()

用于设置坐标区（子图）在画布中的具体位置和大小

# Create the figure with an explicit size.
plt.figure(figsize=(10, 6))

# plt.axes([x_lo, y_lo, width, height]) places an axes by hand. All four
# numbers are fractions of the figure (0 to 1), so x_lo + width and
# y_lo + height must stay <= 1; otherwise the axes runs past the figure
# edge and its right side / title get clipped.
plt.axes([0.1, 0.1, 0.85, 0.8])
plt.plot(df.timestamp, df.X)
plt.xlabel('Time(ms)')
plt.ylim([-5, 20])
plt.title('X-axis')

# Inset: overlay the Y-axis signal in the lower-right corner, kept fully
# inside the main axes placed above.
plt.axes([0.62, 0.2, 0.3, 0.3])
plt.plot(df.timestamp, df.Y, color='r')
plt.xlabel('Time(ms)')
plt.title('Y-axis')

plt.show()

.subplot()

plt.figure(figsize=(10, 5))

# plt.subplot(nrows, ncols, index): the index counts from 1, row by row
# starting at the top-left cell. Here: one row, three columns, one panel
# per accelerometer column (X, Y, Z), each showing the first 100 samples.
for position, axis_name in enumerate(("X", "Y", "Z"), start=1):
    plt.subplot(1, 3, position)
    plt.plot(df.timestamp[0:100], df[axis_name][0:100])
    plt.xlabel('Time(ms)')
    plt.title(f'{axis_name}-axis')

plt.show()

bar

# Bar chart of genres_dist: one bar per index entry, with a title.
plt.figure(figsize=(10, 4))
genres_dist.plot.bar(title="Genres distribution of rated movies of all time")
plt.show()

Pie charts

# Column-wise totals over the rows selected by is_20s, for every column
# from "(no genres listed)" through "Western" (label slices are inclusive).
genre_columns = slice("(no genres listed)", "Western")
genres_20s = movies_prep.loc[is_20s, genre_columns].sum(axis=0)

# Drop the entries whose total is zero so the pie shows no 0% wedges.
genres_20s = genres_20s[genres_20s > 0]

plt.figure(figsize=(8, 8))

# autopct formats the percentage printed on each wedge; pctdistance places
# that text at 0.85 of the radius from the centre.
genres_20s.plot(kind="pie", autopct='%1.1f%%', pctdistance=0.85)

# Blank out the y-axis label.
plt.ylabel('')

# Alternative: pass labels=None to the pie call above and show a legend
# instead (the legend entries must come from the series that was plotted):
# plt.legend(genres_20s.index, loc='right', bbox_to_anchor=(1.3, 0.5))

plt.show()

histogram

# movieIds of the rows in movies_prep whose Fantasy / Musical column is 1.
fantasy_movie_ids = movies_prep.loc[movies_prep["Fantasy"] == 1, "movieId"]
musical_movie_ids = movies_prep.loc[movies_prep["Musical"] == 1, "movieId"]

# Pick the ratings that belong to each id set and call hist on them
# directly; alpha=0.5 makes the bars semi-transparent.

# Fantasy
fantasy_ratings = ratings_df.loc[ratings_df["movieId"].isin(fantasy_movie_ids), "rating"]
fantasy_ratings.hist(alpha=0.5, color='b', label='Fantasy')

# Musical
musical_ratings = ratings_df.loc[ratings_df["movieId"].isin(musical_movie_ids), "rating"]
musical_ratings.hist(alpha=0.5, color='r', label='Musical')

# Descriptive title and x-axis label; the legend uses the label= values above.
plt.xlabel('Rating')
plt.title('Distribution of rated movies with Fantasy and Musical genres')

plt.legend()
plt.show()

追求理性 独立思考 不做韭菜