Chapter 14: Data Analysis Examples

14.1 USA.gov Data from Bitly

Reading the JSON data:

import json

path = 'datasets/bitly_usagov/example.txt'

print(open(path).readline())
# { "a": "Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\
# /535.11 (KHTML, like Gecko) Chrome\/17.0.963.78 Safari\
# /535.11", "c": "US", "nk": 1, "tz": "America\/New_York",
# "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog",
# "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r":
# "http:\/\/www.facebook.com\/l\/7AQEFzjSi\/1.usa.gov\/wfLQtf", "u":
# "http:\/\/www.ncbi.nlm.nih.gov\/pubmed\/22415991", "t": 1331923247,
# "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }

with open(path) as f:
    records = [json.loads(line) for line in f]

print(records[0])
# {'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
# 'c': 'US', 'nk': 1, 'tz': 'America/New_York', 'gr': 'MA', 'g': 'A6qOVH', 'h': 'wfLQtf', 'l': 'orofrog',
# 'al': 'en-US,en;q=0.8', 'hh': '1.usa.gov', 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
# 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991', 't': 1331923247, 'hc': 1331822918, 'cy': 'Danvers',
# 'll': [42.576698, -70.954903]}


# records[0] is just a dict
print(type(records[0]))
# <class 'dict'>
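As an aside, pandas can parse newline-delimited JSON like this directly; a minimal sketch, assuming the same path as above:

import pandas as pd

# lines=True tells read_json to parse one JSON object per line
df = pd.read_json(path, lines=True)
print(len(df))  # should equal len(records)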

Counting time zones in pure Python

import json

path = 'datasets/bitly_usagov/example.txt'
with open(path) as f:
    records = [json.loads(line) for line in f]


# Not every record has a 'tz' key, so filter for it first
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
# Some of the results are empty strings
print(time_zones[:10])
# ['America/New_York', 'America/Denver', 'America/New_York',
# 'America/Sao_Paulo', 'America/New_York', 'America/New_York',
# 'Europe/Warsaw', '', '', '']


from collections import defaultdict

# Count occurrences of each value in a sequence
def get_counts(sequence):
    # values will initialize to 0
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts


counts = get_counts(time_zones)
print(counts['America/New_York'])
# 1251

# Total number of time-zone entries
print(len(time_zones))
# 3440

# Get the top n time zones and their counts
def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

print(top_counts(counts))
# [(33, 'America/Sao_Paulo'), (35, 'Europe/Madrid'),...]


from collections import Counter
counts = Counter(time_zones)
print(counts.most_common(10))
# [('America/New_York', 1251), ('', 521), ...]
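As an alternative to the sort-and-slice in top_counts, heapq.nlargest avoids sorting the entire dictionary; a sketch over the same counts (Counter.most_common is itself built on this):

import heapq

# Keep only the 10 largest entries, keyed on the count
print(heapq.nlargest(10, counts.items(), key=lambda kv: kv[1]))
# [('America/New_York', 1251), ('', 521), ...]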

Counting time zones with pandas

import json

import pandas as pd


path = 'datasets/bitly_usagov/example.txt'
with open(path) as f:
    records = [json.loads(line) for line in f]


frame = pd.DataFrame(records)

print(frame.info())
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 3560 entries, 0 to 3559
# Data columns (total 18 columns):
# _heartbeat_ 120 non-null float64
# a 3440 non-null object
# al 3094 non-null object
# c 2919 non-null object
# cy 2919 non-null object
# g 3440 non-null object
# gr 2919 non-null object
# h 3440 non-null object
# hc 3440 non-null float64
# hh 3440 non-null object
# kw 93 non-null object
# l 3440 non-null object
# ll 2919 non-null object
# nk 3440 non-null float64
# r 3440 non-null object
# t 3440 non-null float64
# tz 3440 non-null object
# u 3440 non-null object
# dtypes: float64(4), object(14)
# memory usage: 500.7+ KB


# First 10 values of the tz column (the pandas equivalent of pulling each dict's 'tz' key earlier)
print(frame['tz'][:10])
# 0 America/New_York
# 1 America/Denver
# 2 America/New_York
# 3 America/Sao_Paulo
# 4 America/New_York
# 5 America/New_York
# 6 Europe/Warsaw
# 7
# 8
# 9
# Name: tz, dtype: object


# Count the values in the tz column
tz_counts = frame['tz'].value_counts()
print(tz_counts[:10])
# America/New_York 1251
# 521
# America/Chicago 400
# America/Los_Angeles 382
# America/Denver 191
# Europe/London 74
# Asia/Tokyo 37
# Pacific/Honolulu 36
# Europe/Madrid 35
# America/Sao_Paulo 33
# Name: tz, dtype: int64


# Fill missing (NA) values
clean_tz = frame['tz'].fillna('Missing')
# Replace empty strings with 'Unknown'
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

# The cleaned counts
print(tz_counts[:10])
# America/New_York 1251
# Unknown 521
# America/Chicago 400
# America/Los_Angeles 382
# America/Denver 191
# Missing 120
# Europe/London 74
# Asia/Tokyo 37
# Pacific/Honolulu 36
# Europe/Madrid 35
# Name: tz, dtype: int64
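The fill-and-replace steps can also be written as a single method chain; a sketch equivalent to the code above:

tz_counts = (frame['tz']
             .fillna('Missing')       # NA values become 'Missing'
             .replace('', 'Unknown')  # empty strings become 'Unknown'
             .value_counts())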

We can visualize this data with matplotlib and seaborn:

import seaborn as sns
import matplotlib.pyplot as plt


subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)

plt.show()

[Figure: horizontal bar plot of the top 10 time zones]
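If seaborn is not available, pandas' built-in plotting gives a rough equivalent; a minimal sketch:

# Series.plot.barh draws a horizontal bar chart of values against the index
tz_counts[:10].plot.barh()
plt.show()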

Continuing with further processing:

# Shorten column 'a' (the user agent string) to its first token
results = pd.Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
# 0 Mozilla/5.0
# 1 GoogleMaps/RochesterNY
# 2 Mozilla/4.0
# 3 Mozilla/5.0
# 4 Mozilla/5.0
# dtype: object

print(results.value_counts()[:8])
# Mozilla/5.0 2594
# Mozilla/4.0 601
# GoogleMaps/RochesterNY 121
# Opera/9.80 34
# TEST_INTERNET_AGENT 24
# GoogleProducer 21
# Mozilla/6.0 5
# BlackBerry8520/5.0.0.681 4
# dtype: int64

import numpy as np

# Drop rows where column 'a' is null; take a copy so that adding
# the 'os' column below doesn't trigger SettingWithCopyWarning
cframe = frame[frame.a.notnull()].copy()

# Add an 'os' column labeling Windows vs. non-Windows users
cframe['os'] = np.where(cframe['a'].str.contains('Windows'),
                        'Windows', 'Not Windows')
print(cframe['os'][:5])
# 0 Windows
# 1 Not Windows
# 2 Windows
# 3 Not Windows
# 4 Windows
# Name: os, dtype: object

# Group by time zone and operating system
by_tz_os = cframe.groupby(['tz', 'os'])

# Count the groups with size
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])
# os Not Windows Windows
# tz
# 245.0 276.0
# Africa/Cairo 0.0 3.0
# Africa/Casablanca 0.0 1.0
# Africa/Ceuta 0.0 2.0
# Africa/Johannesburg 0.0 1.0
# Africa/Lusaka 0.0 1.0
# America/Anchorage 4.0 1.0
# America/Argentina/Buenos_Aires 1.0 0.0
# America/Argentina/Cordoba 0.0 1.0
# America/Argentina/Mendoza 0.0 1.0


# Indirect sort: positions that would sort the row totals in ascending order
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])
# tz
# 24
# Africa/Cairo 20
# Africa/Casablanca 21
# Africa/Ceuta 92
# Africa/Johannesburg 87
# Africa/Lusaka 53
# America/Anchorage 54
# America/Argentina/Buenos_Aires 57
# America/Argentina/Cordoba 26
# America/Argentina/Mendoza 55
# dtype: int64


# Take the last 10 rows, i.e. the time zones with the largest totals:
count_subset = agg_counts.take(indexer[-10:])
print(count_subset)
# os Not Windows Windows
# tz
# America/Sao_Paulo 13.0 20.0
# Europe/Madrid 16.0 19.0
# Pacific/Honolulu 0.0 36.0
# Asia/Tokyo 2.0 35.0
# Europe/London 43.0 31.0
# America/Denver 132.0 59.0
# America/Los_Angeles 130.0 252.0
# America/Chicago 115.0 285.0
# 245.0 276.0
# America/New_York 339.0 912.0


# nlargest, i.e. 'n largest', returns the n largest values directly
print(agg_counts.sum(1).nlargest(10))
# tz
# America/New_York 1251.0
# 521.0
# America/Chicago 400.0
# America/Los_Angeles 382.0
# America/Denver 191.0
# Europe/London 74.0
# Asia/Tokyo 37.0
# Pacific/Honolulu 36.0
# Europe/Madrid 35.0
# America/Sao_Paulo 33.0
# dtype: float64
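Combining the two ideas, the argsort/take steps can be replaced by label indexing on nlargest; a sketch (top10 is our own name) that should select the same rows, just in descending order:

# Row labels of the 10 largest totals, used to index agg_counts directly
top10 = agg_counts.loc[agg_counts.sum(1).nlargest(10).index]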

Now we turn the data into a plot:

# Rearrange the data for plotting
count_subset = count_subset.stack()
count_subset.name = 'total'
count_subset = count_subset.reset_index()
print(count_subset[:10])
# tz os total
# 0 America/Sao_Paulo Not Windows 13.0
# 1 America/Sao_Paulo Windows 20.0
# 2 Europe/Madrid Not Windows 16.0
# 3 Europe/Madrid Windows 19.0
# 4 Pacific/Honolulu Not Windows 0.0
# 5 Pacific/Honolulu Windows 36.0
# 6 Asia/Tokyo Not Windows 2.0
# 7 Asia/Tokyo Windows 35.0
# 8 Europe/London Not Windows 43.0
# 9 Europe/London Windows 31.0

# Windows and non-Windows users in the most frequently occurring time zones
sns.barplot(x='total', y='tz', hue='os', data=count_subset)

plt.show()

[Figure: grouped bar plot of Windows and non-Windows users by time zone]

Percentages of Windows and non-Windows users in the most frequently occurring time zones:

# Normalize so each time zone's group totals sum to 1
def norm_total(group):
    group['normed_total'] = group.total / group.total.sum()
    return group

results = count_subset.groupby('tz').apply(norm_total)

sns.barplot(x='normed_total', y='tz', hue='os', data=results)

plt.show()

[Figure: grouped bar plot of normalized user percentages by time zone]
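The group sums can also be computed more efficiently with groupby's transform method, avoiding the apply; a sketch equivalent to norm_total above (results2 is our own name):

g = count_subset.groupby('tz')
# transform('sum') broadcasts each group's sum back to the original rows
results2 = count_subset.total / g.total.transform('sum')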


14.2 The MovieLens 1M Dataset

Preparing the data:

import pandas as pd

# Make the display more compact
pd.options.display.max_rows = 10

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# sep='::' is more than one character, so pass engine='python'
# to avoid the default C engine's ParserWarning
users = pd.read_table('datasets/movielens/users.dat', sep='::',
                      header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',
                        header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')


print(users[:5])
# user_id gender age occupation zip
# 0 1 F 1 10 48067
# 1 2 M 56 16 70072
# 2 3 M 25 15 55117
# 3 4 M 45 7 02460
# 4 5 M 25 20 55455

print(ratings[:5])
# user_id movie_id rating timestamp
# 0 1 1193 5 978300760
# 1 1 661 3 978302109
# 2 1 914 3 978301968
# 3 1 3408 4 978300275
# 4 1 2355 5 978824291

print(movies[:5])
# movie_id ... genres
# 0 1 ... Animation|Children's|Comedy
# 1 2 ... Adventure|Children's|Fantasy
# 2 3 ... Comedy|Romance
# 3 4 ... Comedy|Drama
# 4 5 ... Comedy

Movies most liked by female viewers:

# merge joins ratings with users first, then merges movies into the result
data = pd.merge(pd.merge(ratings, users), movies)
print(data.iloc[0])
# user_id 1
# movie_id 1193
# rating 5
# timestamp 978300760
# gender F
# age 1
# occupation 10
# zip 48067
# title One Flew Over the Cuckoo's Nest (1975)
# genres Drama
# Name: 0, dtype: object


# Mean rating for each movie, grouped by gender
mean_ratings = data.pivot_table('rating', index='title',
                                columns='gender', aggfunc='mean')
print(mean_ratings[:5])
# gender F M
# title
# $1,000,000 Duck (1971) 3.375000 2.761905
# 'Night Mother (1986) 3.388889 3.352941
# 'Til There Was You (1997) 2.675676 2.733333
# 'burbs, The (1989) 2.793478 2.962085
# ...And Justice for All (1979) 3.828571 3.689024


ratings_by_title = data.groupby('title').size()
print(ratings_by_title[:10])
# title
# $1,000,000 Duck (1971) 37
# 'Night Mother (1986) 70
# 'Til There Was You (1997) 52
# 'burbs, The (1989) 303
# ...And Justice for All (1979) 199
# 1-900 (1994) 2
# 10 Things I Hate About You (1999) 700
# 101 Dalmatians (1961) 565
# 101 Dalmatians (1996) 364
# 12 Angry Men (1957) 616
# dtype: int64


# Filter down to titles with at least 250 ratings
active_titles = ratings_by_title.index[ratings_by_title >= 250]
print(active_titles)
# Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
# '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
# '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
# '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
# '2010 (1984)',
# ...
# 'X-Men (2000)', 'Year of Living Dangerously (1982)',
# 'Yellow Submarine (1968)', 'You've Got Mail (1998)',
# 'Young Frankenstein (1974)', 'Young Guns (1988)',
# 'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
# 'Zero Effect (1998)', 'eXistenZ (1999)'],
# dtype='object', name='title', length=1216)


# Keep only the mean ratings for the titles with at least 250 ratings
mean_ratings = mean_ratings.loc[active_titles]
print(mean_ratings)
# gender F M
# title
# 'burbs, The (1989) 2.793478 2.962085
# 10 Things I Hate About You (1999) 3.646552 3.311966
# 101 Dalmatians (1961) 3.791444 3.500000
# 101 Dalmatians (1996) 3.240000 2.911215
# 12 Angry Men (1957) 4.184397 4.328421
# ... ... ...
# Young Guns (1988) 3.371795 3.425620
# Young Guns II (1990) 2.934783 2.904025
# Young Sherlock Holmes (1985) 3.514706 3.363344
# Zero Effect (1998) 3.864407 3.723140
# eXistenZ (1999) 3.098592 3.289086
# [1216 rows x 2 columns]


# Sort by the F column in descending order
# to get the movies female viewers rated highest
top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)
print(top_female_ratings[:10])
# gender F M
# title
# Close Shave, A (1995) 4.644444 4.473795
# Wrong Trousers, The (1993) 4.588235 4.478261
# Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
# Wallace & Gromit: The Best of Aardman Animation... 4.563107 4.385075
# Schindler's List (1993) 4.562602 4.491415
# Shawshank Redemption, The (1994) 4.539075 4.560625
# Grand Day Out, A (1992) 4.537879 4.293255
# To Kill a Mockingbird (1962) 4.536667 4.372611
# Creature Comforts (1990) 4.513889 4.272277
# Usual Suspects, The (1995) 4.513317 4.518248
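Symmetrically, sorting by the M column gives the movies male viewers rated highest; a sketch (top_male_ratings is our own name):

top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
print(top_male_ratings[:10])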

Measuring rating disagreement

Suppose we want to find the movies on which male and female viewers disagree the most. One way is to add a column to mean_ratings holding the difference between the mean ratings, then sort by it:

mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
# Ascending sort: the most negative diffs, i.e. the movies
# female viewers preferred, come first
sorted_by_diff = mean_ratings.sort_values(by='diff')
print(sorted_by_diff[:10])
# gender F M diff
# title
# Dirty Dancing (1987) 3.790378 2.959596 -0.830782
# Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359
# Grease (1978) 3.975265 3.367041 -0.608224
# Little Women (1994) 3.870588 3.321739 -0.548849
# Steel Magnolias (1989) 3.901734 3.365957 -0.535777
# Anastasia (1997) 3.800000 3.281609 -0.518391
# Rocky Horror Picture Show, The (1975) 3.673016 3.160131 -0.512885
# Color Purple, The (1985) 4.158192 3.659341 -0.498851
# Age of Innocence, The (1993) 3.827068 3.339506 -0.487561
# Free Willy (1993) 2.921348 2.438776 -0.482573
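Reversing the row order and again taking the first 10 rows should give the movies that male viewers preferred; a sketch:

# The largest (M - F) differences now come first
print(sorted_by_diff[::-1][:10])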

Computing the standard deviation of ratings:

# Standard deviation of ratings, grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
# Sort in descending order
print(rating_std_by_title.sort_values(ascending=False)[:10])
# title
# Dumb & Dumber (1994) 1.321333
# Blair Witch Project, The (1999) 1.316368
# Natural Born Killers (1994) 1.307198
# Tank Girl (1995) 1.277695
# Rocky Horror Picture Show, The (1975) 1.260177
# Eyes Wide Shut (1999) 1.259624
# Evita (1996) 1.253631
# Billy Madison (1995) 1.249970
# Fear and Loathing in Las Vegas (1998) 1.246408
# Bicentennial Man (1999) 1.245533
# Name: rating, dtype: float64

14.3 US Baby Names 1880-2010