# Exploratory analysis of video-game sales read from ``games.csv``.
#
# Steps performed by this script:
#   1. load the data and normalise column names;
#   2. fill missing year / critic score / user score / rating values;
#   3. compute world-wide sales and find the ten best-selling platforms;
#   4. estimate the typical platform lifetime;
#   5. plot user score against sales for the popular platforms since 2008.
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy.stats import levene
from scipy.stats import ttest_1samp, ttest_ind

# Critic score used for rows with no critic score, keyed by the integer part
# of the row's user score. The values are hard-coded constants carried over
# unchanged from the original script.
CRITIC_SCORE_BY_USER_COHORT = {
    0.0: 52.250000,
    1.0: 43.340860,
    2.0: 48.46782,
    3.0: 50.38503,
    4.0: 56.278505,
    5.0: 61.10962,
    6.0: 65.62730,
    7.0: 71.28083,
    8.0: 76.69920,
    9.0: 80.4016,
}

# First release year kept for the "current period" part of the analysis.
ANALYSIS_START_YEAR = 2008


def _fill_from_group_mean(frame: pd.DataFrame, key: str, column: str) -> pd.Series:
    """Return ``frame[column]`` with NaNs replaced by the mean of their ``key`` group."""
    return frame.groupby(key)[column].transform(lambda s: s.fillna(s.mean()))


def _fill_with_group_mode(group: pd.Series) -> pd.Series:
    """Return ``group`` with NaNs replaced by its most common known value.

    The mode is taken over non-missing values only; otherwise NaN itself can
    be the "most common" entry and nothing would be filled. Groups with no
    known value are returned unchanged.
    """
    known = group.dropna()
    if known.empty:
        return group
    return group.fillna(Counter(known).most_common(1)[0][0])


# --- Loading and cleaning ---------------------------------------------------
df = pd.read_csv('games.csv')
df.columns = df.columns.str.lower()
df = df.dropna(subset=['name']).reset_index(drop=True)

# Borrow missing values from other rows that share the same game name.
df['year_of_release'] = _fill_from_group_mean(df, 'name', 'year_of_release')
df['critic_score'] = _fill_from_group_mean(df, 'name', 'critic_score')

# 'tbd' is treated as a missing user score so the column can be numeric.
df['user_score'] = df['user_score'].mask(df['user_score'] == 'tbd').astype('float')
df['user_score'] = _fill_from_group_mean(df, 'name', 'user_score')

# Cohort = integer part of the user score; it selects the fallback critic
# score. Rows without a user score (or outside the table) stay missing.
user_score_cohort = df['user_score'] // 1
df['critic_score'] = df['critic_score'].fillna(
    user_score_cohort.map(CRITIC_SCORE_BY_USER_COHORT)
)

df['rating'] = df.groupby('name')['rating'].transform(_fill_with_group_mode)
# Assign the result back: an in-place fillna on ``df.rating`` is a chained
# assignment and does not reliably modify ``df``.
df['rating'] = df['rating'].fillna('No rating')

# Remaining unknown years fall back to the platform mean; the cast to int
# truncates the fractional part.
df['year_of_release'] = _fill_from_group_mean(df, 'platform', 'year_of_release')
df['year_of_release'] = df['year_of_release'].astype('int')

# --- Sales per platform -----------------------------------------------------
# Total sales across all regions.
df['world_sales'] = df['na_sales'] + df['eu_sales'] + df['jp_sales'] + df['other_sales']

# Total sales of every platform; the ten largest are the "popular" ones.
platform_sales = df.groupby('platform')['world_sales'].sum()
most_popular = platform_sales.sort_values(ascending=False).index.values[:10]

popular_df = df[df.platform.isin(most_popular)]
popular_df = popular_df.pivot_table(
    index='year_of_release', columns='platform', values='world_sales', aggfunc='sum'
)
popular_df.plot(figsize=(8, 6))

# --- Platform lifetime ------------------------------------------------------
# One row per platform, one column per year; years without sales become 0.
temp = popular_df.T.fillna(0)
# Lifetime = number of years with non-zero sales.
life_of_platform = temp.ne(0).sum(axis=1)
# PC is left out of the average (skipped silently if it is not in the top ten).
print('Среднее время жизни платформы:', life_of_platform.drop('PC', errors='ignore').mean())

# --- Current period ---------------------------------------------------------
df_striped = df[df['year_of_release'] >= ANALYSIS_START_YEAR].copy()
df_striped.pivot_table(
    index='platform', columns='year_of_release', values='world_sales', aggfunc='sum'
).T.plot(figsize=(10, 7))

df_popular = df_striped[df_striped.platform.isin(most_popular)]
# numeric_only avoids aggregating the text columns (name, genre, rating, ...).
df_popular_grouped = df_popular.groupby('platform').sum(numeric_only=True)
popular_platforms_list = df_popular_grouped.index.values

# --- User score vs. sales, one panel per platform ---------------------------
# Panels are driven by popular_platforms_list so that each title always
# matches the data drawn in its panel.
fig, ax = plt.subplots(
    nrows=1, ncols=len(popular_platforms_list), sharey=True, figsize=(30, 6)
)
panels = np.atleast_1d(ax)  # plt.subplots returns a bare Axes when ncols == 1
for panel, platform_name in zip(panels, popular_platforms_list):
    panel.scatter(
        data=df_popular[df_popular.platform == platform_name],
        x='user_score',
        y='world_sales',
    )
    panel.set_xlabel('User score')
    panel.set_title(platform_name)
fig.suptitle('Зависимость между user score и продажами у каждой платформы')
panels[0].set_ylabel('World sales')
plt.show()