Chapter 14: Data Analysis Examples

14.1 USA.gov Data from Bitly

Reading the JSON data:

import json

path = 'datasets/bitly_usagov/example.txt'

print(open(path).readline())
# { "a": "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\
# /535.11 (KHTML, like Gecko) Chrome\/17.0.963.78 Safari\
# /535.11", "c": "US", "nk": 1, "tz": "America\/New_York",
# "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog",
# "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r":
# "http:\/\/www.facebook.com\/l\/7AQEFzjSi\/1.usa.gov\/wfLQtf", "u":
# "http:\/\/www.ncbi.nlm.nih.gov\/pubmed\/22415991", "t": 1331923247,
# "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }

with open(path) as f:
    records = [json.loads(line) for line in f]

print(records[0])
# {'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
# 'c': 'US', 'nk': 1, 'tz': 'America/New_York', 'gr': 'MA', 'g': 'A6qOVH', 'h': 'wfLQtf', 'l': 'orofrog',
# 'al': 'en-US,en;q=0.8', 'hh': '1.usa.gov', 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
# 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991', 't': 1331923247, 'hc': 1331822918, 'cy': 'Danvers',
# 'll': [42.576698, -70.954903]}


# records[0] is just a dict
print(type(records[0]))
# <class 'dict'>
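As an aside, pandas can parse newline-delimited JSON like this directly; a minimal sketch, assuming the same path as above:

import pandas as pd

# lines=True tells read_json to parse one JSON object per line
df = pd.read_json(path, lines=True)
print(len(df))  # should equal len(records)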

Counting time zones in pure Python

import json

path = 'datasets/bitly_usagov/example.txt'
with open(path) as f:
    records = [json.loads(line) for line in f]


# Not every record has a 'tz' key, so filter for it first
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# Some of the results are empty strings
print(time_zones[:10])
# ['America/New_York', 'America/Denver', 'America/New_York',
# 'America/Sao_Paulo', 'America/New_York', 'America/New_York',
# 'Europe/Warsaw', '', '', '']


from collections import defaultdict

# Count occurrences of each value in a sequence
def get_counts(sequence):
    # values will initialize to 0
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts


counts = get_counts(time_zones)
print(counts['America/New_York'])
# 1251

# Total number of time-zone entries
print(len(time_zones))
# 3440

# Get the top n time zones and their counts
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

print(top_counts(counts))
# [(33, 'America/Sao_Paulo'), (35, 'Europe/Madrid'),...]


from collections import Counter
counts = Counter(time_zones)
print(counts.most_common(10))
# [('America/New_York', 1251), ('', 521), ...]
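As an alternative to the sort-and-slice in top_counts, heapq.nlargest avoids sorting the entire dictionary; a sketch over the same counts (Counter.most_common is itself built on this):

import heapq

# Keep only the 10 largest entries, keyed on the count
print(heapq.nlargest(10, counts.items(), key=lambda kv: kv[1]))
# [('America/New_York', 1251), ('', 521), ...]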

Counting time zones with pandas

import json

import pandas as pd


path = 'datasets/bitly_usagov/example.txt'
with open(path) as f:
    records = [json.loads(line) for line in f]


frame = pd.DataFrame(records)

print(frame.info())
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3560 entries, 0 to 3559
# Data columns (total 18 columns):
# _heartbeat_ 120 non-null float64
# a 3440 non-null object
# al 3094 non-null object
# c 2919 non-null object
# cy 2919 non-null object
# g 3440 non-null object
# gr 2919 non-null object
# h 3440 non-null object
# hc 3440 non-null float64
# hh 3440 non-null object
# kw 93 non-null object
# l 3440 non-null object
# ll 2919 non-null object
# nk 3440 non-null float64
# r 3440 non-null object
# t 3440 non-null float64
# tz 3440 non-null object
# u 3440 non-null object
# dtypes: float64(4), object(14)
# memory usage: 500.7+ KB


# First 10 values of the tz column (the pandas equivalent of pulling each dict's 'tz' key earlier)
print(frame['tz'][:10])
# 0 America/New_York
# 1 America/Denver
# 2 America/New_York
# 3 America/Sao_Paulo
# 4 America/New_York
# 5 America/New_York
# 6 Europe/Warsaw
# 7
# 8
# 9
# Name: tz, dtype: object


# Count the values in the tz column
tz_counts = frame['tz'].value_counts()
print(tz_counts[:10])
# America/New_York 1251
# 521
# America/Chicago 400
# America/Los_Angeles 382
# America/Denver 191
# Europe/London 74
# Asia/Tokyo 37
# Pacific/Honolulu 36
# Europe/Madrid 35
# America/Sao_Paulo 33
# Name: tz, dtype: int64


# Fill missing (NA) values
clean_tz = frame['tz'].fillna('Missing')
# Replace empty strings with 'Unknown'
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

# The cleaned counts
print(tz_counts[:10])
# America/New_York 1251
# Unknown 521
# America/Chicago 400
# America/Los_Angeles 382
# America/Denver 191
# Missing 120
# Europe/London 74
# Asia/Tokyo 37
# Pacific/Honolulu 36
# Europe/Madrid 35
# Name: tz, dtype: int64
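The fill-and-replace steps can also be written as a single method chain; a sketch equivalent to the code above:

tz_counts = (frame['tz']
             .fillna('Missing')       # NA values become 'Missing'
             .replace('', 'Unknown')  # empty strings become 'Unknown'
             .value_counts())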

We can visualize this data with matplotlib and seaborn:

import seaborn as sns
import matplotlib.pyplot as plt


subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)

plt.show()

[Figure: horizontal bar plot of the top 10 time zones]
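If seaborn is not available, pandas' built-in plotting gives a rough equivalent; a minimal sketch:

# Series.plot.barh draws a horizontal bar chart of values against the index
tz_counts[:10].plot.barh()
plt.show()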

Continuing with further processing:

# Shorten column 'a' (the user agent string) to its first token
results = pd.Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
# 0 Mozilla/5.0
# 1 GoogleMaps/RochesterNY
# 2 Mozilla/4.0
# 3 Mozilla/5.0
# 4 Mozilla/5.0
# dtype: object

print(results.value_counts()[:8])
# Mozilla/5.0 2594
# Mozilla/4.0 601
# GoogleMaps/RochesterNY 121
# Opera/9.80 34
# TEST_INTERNET_AGENT 24
# GoogleProducer 21
# Mozilla/6.0 5
# BlackBerry8520/5.0.0.681 4
# dtype: int64

import numpy as np

# Drop rows where column 'a' is null; take a copy so that adding
# the 'os' column below doesn't trigger SettingWithCopyWarning
cframe = frame[frame.a.notnull()].copy()

# Add an 'os' column labeling Windows vs. non-Windows users
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
                        'Windows', 'Not Windows')
print(cframe['os'][:5])
# 0 Windows
# 1 Not Windows
# 2 Windows
# 3 Not Windows
# 4 Windows
# Name: os, dtype: object

# Group by time zone and operating system
by_tz_os = cframe.groupby(['tz', 'os'])

# Count the groups with size
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])
# os Not Windows Windows
# tz
# 245.0 276.0
# Africa/Cairo 0.0 3.0
# Africa/Casablanca 0.0 1.0
# Africa/Ceuta 0.0 2.0
# Africa/Johannesburg 0.0 1.0
# Africa/Lusaka 0.0 1.0
# America/Anchorage 4.0 1.0
# America/Argentina/Buenos_Aires 1.0 0.0
# America/Argentina/Cordoba 0.0 1.0
# America/Argentina/Mendoza 0.0 1.0


# Indirect sort: positions that would sort the row totals in ascending order
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])
# tz
# 24
# Africa/Cairo 20
# Africa/Casablanca 21
# Africa/Ceuta 92
# Africa/Johannesburg 87
# Africa/Lusaka 53
# America/Anchorage 54
# America/Argentina/Buenos_Aires 57
# America/Argentina/Cordoba 26
# America/Argentina/Mendoza 55
# dtype: int64


# Take the last 10 rows, i.e. the time zones with the largest totals:
count_subset = agg_counts.take(indexer[-10:])
print(count_subset)
# os Not Windows Windows
# tz
# America/Sao_Paulo 13.0 20.0
# Europe/Madrid 16.0 19.0
# Pacific/Honolulu 0.0 36.0
# Asia/Tokyo 2.0 35.0
# Europe/London 43.0 31.0
# America/Denver 132.0 59.0
# America/Los_Angeles 130.0 252.0
# America/Chicago 115.0 285.0
# 245.0 276.0
# America/New_York 339.0 912.0


# nlargest, i.e. 'n largest', returns the n largest values directly
print(agg_counts.sum(1).nlargest(10))
# tz
# America/New_York 1251.0
# 521.0
# America/Chicago 400.0
# America/Los_Angeles 382.0
# America/Denver 191.0
# Europe/London 74.0
# Asia/Tokyo 37.0
# Pacific/Honolulu 36.0
# Europe/Madrid 35.0
# America/Sao_Paulo 33.0
# dtype: float64
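Combining the two ideas, the argsort/take steps can be replaced by label indexing on nlargest; a sketch (top10 is our own name) that should select the same rows, just in descending order:

# Row labels of the 10 largest totals, used to index agg_counts directly
top10 = agg_counts.loc[agg_counts.sum(1).nlargest(10).index]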

Now we turn the data into a plot:

# Rearrange the data for plotting
count_subset = count_subset.stack()
count_subset.name = 'total'
count_subset = count_subset.reset_index()
print(count_subset[:10])
# tz os total
# 0 America/Sao_Paulo Not Windows 13.0
# 1 America/Sao_Paulo Windows 20.0
# 2 Europe/Madrid Not Windows 16.0
# 3 Europe/Madrid Windows 19.0
# 4 Pacific/Honolulu Not Windows 0.0
# 5 Pacific/Honolulu Windows 36.0
# 6 Asia/Tokyo Not Windows 2.0
# 7 Asia/Tokyo Windows 35.0
# 8 Europe/London Not Windows 43.0
# 9 Europe/London Windows 31.0

# Windows and non-Windows users in the most frequently occurring time zones
sns.barplot(x='total', y='tz', hue='os', data=count_subset)

plt.show()

[Figure: grouped bar plot of Windows and non-Windows users by time zone]

Percentages of Windows and non-Windows users in the most frequently occurring time zones:

# Normalize so each time zone's group totals sum to 1
def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group

results = count_subset.groupby('tz').apply(norm_total)

sns.barplot(x='normed_total', y='tz', hue='os', data=results)

plt.show()

[Figure: grouped bar plot of normalized user percentages by time zone]
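The group sums can also be computed more efficiently with groupby's transform method, avoiding the apply; a sketch equivalent to norm_total above (results2 is our own name):

g = count_subset.groupby('tz')
# transform('sum') broadcasts each group's sum back to the original rows
results2 = count_subset.total / g.total.transform('sum')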


14.2 The MovieLens 1M Dataset

Preparing the data:

import pandas as pd

# Make the display more compact
pd.options.display.max_rows = 10

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# sep='::' is more than one character, so pass engine='python'
# to avoid the default C engine's ParserWarning
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')


print(users[:5])
# user_id gender age occupation zip
# 0 1 F 1 10 48067
# 1 2 M 56 16 70072
# 2 3 M 25 15 55117
# 3 4 M 45 7 02460
# 4 5 M 25 20 55455

print(ratings[:5])
# user_id movie_id rating timestamp
# 0 1 1193 5 978300760
# 1 1 661 3 978302109
# 2 1 914 3 978301968
# 3 1 3408 4 978300275
# 4 1 2355 5 978824291

print(movies[:5])
# movie_id ... genres
# 0 1 ... Animation|Children's|Comedy
# 1 2 ... Adventure|Children's|Fantasy
# 2 3 ... Comedy|Romance
# 3 4 ... Comedy|Drama
# 4 5 ... Comedy

Movies most liked by female viewers:

# merge joins ratings with users first, then merges movies into the result
data = pd.merge(pd.merge(ratings, users), movies)
print(data.iloc[0])
# user_id 1
# movie_id 1193
# rating 5
# timestamp 978300760
# gender F
# age 1
# occupation 10
# zip 48067
# title One Flew Over the Cuckoo's Nest (1975)
# genres Drama
# Name: 0, dtype: object


# Mean rating for each movie, grouped by gender
mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
print(mean_ratings[:5])
# gender F M
# title
# $1,000,000 Duck (1971) 3.375000 2.761905
# 'Night Mother (1986) 3.388889 3.352941
# 'Til There Was You (1997) 2.675676 2.733333
# 'burbs, The (1989) 2.793478 2.962085
# ...And Justice for All (1979) 3.828571 3.689024


ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:10])
# title
# $1,000,000 Duck (1971) 37
# 'Night Mother (1986) 70
# 'Til There Was You (1997) 52
# 'burbs, The (1989) 303
# ...And Justice for All (1979) 199
# 1-900 (1994) 2
# 10 Things I Hate About You (1999) 700
# 101 Dalmatians (1961) 565
# 101 Dalmatians (1996) 364
# 12 Angry Men (1957) 616
# dtype: int64


# Filter down to titles with at least 250 ratings
active_titles = ratings_by_title.index[ratings_by_title >= 250]
print(active_titles)
# Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
# '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
# '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
# '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
# '2010 (1984)',
# ...
# 'X-Men (2000)', 'Year of Living Dangerously (1982)',
# 'Yellow Submarine (1968)', 'You've Got Mail (1998)',
# 'Young Frankenstein (1974)', 'Young Guns (1988)',
# 'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
# 'Zero Effect (1998)', 'eXistenZ (1999)'],
# dtype='object', name='title', length=1216)


# Keep only the mean ratings for the titles with at least 250 ratings
mean_ratings = mean_ratings.loc[active_titles]
print(mean_ratings)
# gender F M
# title
# 'burbs, The (1989) 2.793478 2.962085
# 10 Things I Hate About You (1999) 3.646552 3.311966
# 101 Dalmatians (1961) 3.791444 3.500000
# 101 Dalmatians (1996) 3.240000 2.911215
# 12 Angry Men (1957) 4.184397 4.328421
# ... ... ...
# Young Guns (1988) 3.371795 3.425620
# Young Guns II (1990) 2.934783 2.904025
# Young Sherlock Holmes (1985) 3.514706 3.363344
# Zero Effect (1998) 3.864407 3.723140
# eXistenZ (1999) 3.098592 3.289086
# [1216 rows x 2 columns]


# Sort by the F column in descending order
# to get the movies female viewers rated highest
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
print(top_female_ratings[:10])
# gender F M
# title
# Close Shave, A (1995) 4.644444 4.473795
# Wrong Trousers, The (1993) 4.588235 4.478261
# Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
# Wallace & Gromit: The Best of Aardman Animation... 4.563107 4.385075
# Schindler's List (1993) 4.562602 4.491415
# Shawshank Redemption, The (1994) 4.539075 4.560625
# Grand Day Out, A (1992) 4.537879 4.293255
# To Kill a Mockingbird (1962) 4.536667 4.372611
# Creature Comforts (1990) 4.513889 4.272277
# Usual Suspects, The (1995) 4.513317 4.518248
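Symmetrically, sorting by the M column gives the movies male viewers rated highest; a sketch (top_male_ratings is our own name):

top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
print(top_male_ratings[:10])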

Measuring rating disagreement

Suppose we want to find the movies on which male and female viewers disagree the most. One way is to add a column to mean_ratings holding the difference between the mean ratings, then sort by it:

mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# Ascending sort: the most negative diffs, i.e. the movies
# female viewers preferred, come first
sorted_by_diff = mean_ratings.sort_values(by='diff')
print(sorted_by_diff[:10])
# gender F M diff
# title
# Dirty Dancing (1987) 3.790378 2.959596 -0.830782
# Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359
# Grease (1978) 3.975265 3.367041 -0.608224
# Little Women (1994) 3.870588 3.321739 -0.548849
# Steel Magnolias (1989) 3.901734 3.365957 -0.535777
# Anastasia (1997) 3.800000 3.281609 -0.518391
# Rocky Horror Picture Show, The (1975) 3.673016 3.160131 -0.512885
# Color Purple, The (1985) 4.158192 3.659341 -0.498851
# Age of Innocence, The (1993) 3.827068 3.339506 -0.487561
# Free Willy (1993) 2.921348 2.438776 -0.482573
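Reversing the row order and again taking the first 10 rows should give the movies that male viewers preferred; a sketch:

# The largest (M - F) differences now come first
print(sorted_by_diff[::-1][:10])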

Computing the standard deviation of ratings:

# Standard deviation of ratings, grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
# Sort in descending order
print(rating_std_by_title.sort_values(ascending=False)[:10])
# title
# Dumb & Dumber (1994) 1.321333
# Blair Witch Project, The (1999) 1.316368
# Natural Born Killers (1994) 1.307198
# Tank Girl (1995) 1.277695
# Rocky Horror Picture Show, The (1975) 1.260177
# Eyes Wide Shut (1999) 1.259624
# Evita (1996) 1.253631
# Billy Madison (1995) 1.249970
# Fear and Loathing in Las Vegas (1998) 1.246408
# Bicentennial Man (1999) 1.245533
# Name: rating, dtype: float64

14.3 US Baby Names 1880-2010