第六章:数据编码和处理

6.1 读写 CSV 数据

1
2
3
4
5
6
7
8
9
10
import csv

# newline='' hands line-ending handling to the csv module, which is what the
# csv documentation asks for so quoted fields with embedded newlines parse
# correctly.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    # Skip the first row (the column headers)
    headers = next(f_csv)
    for row in f_csv:
        # Each row is a list of strings, one item per column
        print(row)
        # ['AA', '39.48', '6/11/2007', '9:36am', '-0.18', '181800']
        # ['AIG', '71.38', '6/11/2007', '9:36am', '-0.15', '195500']

经常使用的方式 : 使用命名元组

1
2
3
4
5
6
7
8
9
10
11
12
import csv
from collections import namedtuple

# newline='' lets the csv module do its own line-ending handling.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    # Use the header row as the field names of the named tuple
    # (this requires every column header to be a valid Python identifier)
    Row = namedtuple('Row', next(f_csv))
    for r in f_csv:
        row = Row(*r)
        print(row)
        # Row(Symbol='AA', Price='39.48', Date='6/11/2007', Time='9:36am', Change='-0.18', Volume='181800')
        # Row(Symbol='AIG', Price='71.38', Date='6/11/2007', Time='9:36am', Change='-0.15', Volume='195500')

当然,最经常使用的还是将csv.reader阅读器改为csv.DictReader:

1
2
3
4
5
6
7
8
import csv

# newline='' lets the csv module do its own line-ending handling.
with open('stocks.csv', newline='') as f:
    # DictReader takes the first row as the keys of every following row
    f_csv = csv.DictReader(f)
    for row in f_csv:
        print(row)
        # Sample output from Python 3.6/3.7, where rows are OrderedDict
        # instances; from Python 3.8 on each row is a plain dict.
        # OrderedDict([('Symbol', 'AA'), ('Price', '39.48'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.18'), ('Volume', '181800')])
        # OrderedDict([('Symbol', 'AIG'), ('Price', '71.38'), ('Date', '6/11/2007'), ('Time', '9:36am'), ('Change', '-0.15'), ('Volume', '195500')])

阅读需要阅读器(reader),写入自然也需要写入器(writer)

1
2
3
4
5
6
7
8
9
10
11
12
import csv

# Column titles, written as the first line of the file
headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']
# One tuple per data row, in the same column order as the headers
rows = [
    ('AA', 39.48, '6/11/2007', '9:36am', -0.18, 181800),
    ('AIG', 71.38, '6/11/2007', '9:36am', -0.15, 195500),
    ('AXP', 62.58, '6/11/2007', '9:36am', -0.46, 935000),
]

# newline='' stops the text layer from translating the '\r\n' the csv writer
# emits; without it, Windows output contains an extra blank line per row.
with open('my.csv', 'w', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)

同样也有DictWriter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import csv

# Field names; they fix both the header line and the column order
headers = ['Symbol', 'Price', 'Date', 'Time', 'Change', 'Volume']

# One dict per data row, keyed by the field names above
rows = [
    {'Symbol': 'AA', 'Price': 39.48, 'Date': '6/11/2007',
     'Time': '9:36am', 'Change': -0.18, 'Volume': 181800},

    {'Symbol': 'AIG', 'Price': 71.38, 'Date': '6/11/2007',
     'Time': '9:36am', 'Change': -0.15, 'Volume': 195500},

    {'Symbol': 'AXP', 'Price': 62.58, 'Date': '6/11/2007',
     'Time': '9:36am', 'Change': -0.46, 'Volume': 935000},
]

# newline='' stops the text layer from translating the '\r\n' the csv writer
# emits; without it, Windows output contains an extra blank line per row.
with open('my.csv', 'w', newline='') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    f_csv.writerows(rows)

csv 产生的数据都是字符串类型的,它不会做任何其他类型的转换。
如果你需要做这样的类型转换,你必须自己手动去实现。

1
2
3
4
5
6
7
8
9
# One converter per column, listed in column order
col_types = [str, float, str, str, float, int]

# newline='' lets the csv module do its own line-ending handling.
with open('stocks.csv', newline='') as f:
    f_csv = csv.reader(f)
    headers = next(f_csv)
    for row in f_csv:
        # Apply conversions to the row items
        row = tuple(convert(value) for convert, value in zip(col_types, row))
        print(row)
1
2
3
4
5
6
7
8
9
# (field name, converter) pairs for the fields that should not stay strings
field_types = [('Price', float),
               ('Change', float),
               ('Volume', int)]

# newline='' lets the csv module do its own line-ending handling.
with open('stocks.csv', newline='') as f:
    for row in csv.DictReader(f):
        # Overwrite only the listed fields with their converted values;
        # every other field keeps the string read from the file.
        row.update((key, conversion(row[key]))
                   for key, conversion in field_types)
        print(row)

6.2 读写 JSON 数据

1
2
3
4
5
6
7
8
9
10
import json

# Sample record to push through a JSON round trip
data = dict(name='ACME', shares=100, price=542.23)

# Python object -> JSON text
json_str = json.dumps(data)
# JSON text -> Python object
data = json.loads(json_str)

在 web 应用程序中,顶层对象最好是一个字典

我们在获取嵌套json数据的时候可以使用pprint,使打印更美观

pprint模块 – pretty print , 美观打印

  • print()和pprint()功能基本一样,唯一的区别就是pprint()模块打印出来的数据结构更加完整,每行为一个数据结构,更加方便阅读打印输出结果。
  • 格式化工具会生成数据结构的一些表示,不仅可以由解释器正确地解析,而且便于人类阅读。输出尽可能放在一行上,分解为多行时则需要缩进。
1
2
3
4
5
6
7
8
9
10
11
from pprint import pprint

# A nested structure: a tuple holding a string, a list, a string and a dict
data = (
    "test",
    [1, 2, 3, 'test', 4, 5],
    "This is a string!",
    {'age': 23, 'gender': 'F'},
)

# print() writes the whole repr on one line
print(data)
# ('test', [1, 2, 3, 'test', 4, 5], 'This is a string!', {'age': 23, 'gender': 'F'})

# pprint() splits the same structure over several lines
pprint(data)
# ('test',
# [1, 2, 3, 'test', 4, 5],
# 'This is a string!',
# {'age': 23, 'gender': 'F'})

我们还可以使用object_pairs_hook参数,迅速修改类型:

1
2
3
4
5
6
7
8
9
10
11
12
import json
from collections import OrderedDict

s = '{"name": "ACME", "shares": 50, "price": 490.1}'

# Decode the same document twice: first with no hook (the default, giving a
# plain dict), then with OrderedDict passed as object_pairs_hook.
for hook in (None, OrderedDict):
    data = json.loads(s, object_pairs_hook=hook)
    print(data)
# {'name': 'ACME', 'shares': 50, 'price': 490.1}
# OrderedDict([('name', 'ACME'), ('shares', 50), ('price', 490.1)])

json是不支持序列化实例对象的,不过我们可以设计一个函数将实例对象的属性制作成字典,反序列化的时候再赋值回去

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json


def serialize_instance(obj):
    """Return a JSON-serializable dict describing ``obj``.

    The dict holds the class name under ``'__classname__'`` followed by
    every entry of ``vars(obj)``. It is meant to be passed as the
    ``default=`` callback of ``json.dumps``.

    Raises:
        TypeError: if ``obj`` has no ``__dict__`` (raised by ``vars``).
    """
    d = {'__classname__': type(obj).__name__}
    d.update(vars(obj))
    return d


def unserialize_object(d):
    """Rebuild an instance from a dict produced by ``serialize_instance``.

    Intended as the ``object_hook=`` callback of ``json.loads``. A dict
    without a (truthy) ``'__classname__'`` entry is returned as is.
    Otherwise the class is looked up in the module-level ``classes``
    mapping, so only classes registered there can be instantiated, and the
    remaining items become attributes of a new instance.

    Note that ``'__classname__'`` is popped from ``d`` in place.

    Raises:
        KeyError: if the class name is not a key of ``classes``.
    """
    clsname = d.pop('__classname__', None)
    if not clsname:
        return d

    cls = classes[clsname]
    obj = cls.__new__(cls)  # Make instance without calling __init__
    for key, value in d.items():
        setattr(obj, key, value)
    return obj


class Person(object):
    """Minimal example class carrying a single ``name`` attribute."""

    def __init__(self, name):
        self.name = name


# Registry of class name -> class, consulted when decoding
classes = {Person.__name__: Person}


me = Person('heyingliang')

# Encode: json.dumps calls serialize_instance for objects it cannot handle
j = json.dumps(me, default=serialize_instance)
print(j)
# {"__classname__": "Person", "name": "heyingliang"}

# Decode: every decoded JSON object is passed through unserialize_object
a = json.loads(j, object_hook=unserialize_object)
print(a)
# <__main__.Person object at 0x000001FF1750AC18>

print(a.name)
# heyingliang

6.3 解析简单的 XML 数据

使用 xml.etree.ElementTree 模块

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from urllib.request import urlopen
from xml.etree.ElementTree import parse


# Download the RSS feed and parse it
# (the with-block closes the HTTP response as soon as parsing is finished)
with urlopen('http://planet.python.org/rss20.xml') as u:
    doc = parse(u)

# Extract and output tags of interest
for item in doc.iterfind('channel/item'):
    title = item.findtext('title')
    date = item.findtext('pubDate')
    link = item.findtext('link')
    print(title)
    print(date)
    print(link)
    print()
  • doc 是 parse() 返回的 ElementTree 对象,代表整个文档,查找从根标签开始
  • doc.iterfind('channel/item') 表示 : 寻找 channel 标签下的 item 标签

有两种解析器 :

  • from xml.etree.ElementTree import parse
  • from xml.etree.cElementTree import parse

cElementTree 是C语言实现的,ElementTree 是Python实现的。

在 Python 3.3 之前应尽量使用C语言版;从 Python 3.3 起,ElementTree 会自动使用C加速实现,cElementTree 已被弃用,并在 Python 3.9 中被移除。

1
2
3
4
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9.
# Since 3.3, xml.etree.ElementTree uses the C accelerator on its own when it
# is available, so on current interpreters the fallback import is the one
# that runs.
try:
    from xml.etree.cElementTree import parse
except ImportError:
    from xml.etree.ElementTree import parse

6.4 增量式解析大型 XML 文件

使用迭代器和生成器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from xml.etree.ElementTree import iterparse


def parse_and_remove(filename, path):
    """Incrementally yield the elements located at ``path`` in an XML file.

    This is a generator. Each matching element is yielded once it has been
    fully parsed, and is removed from its parent when the generator is
    resumed, so the elements already handled do not pile up in memory.

    Args:
        filename: File name or file object accepted by ``iterparse``.
        path: ``'/'``-separated tag names, relative to the root element
            (the root's own tag is not part of the path).

    Yields:
        Every element whose chain of tags below the root equals ``path``.
    """
    path_parts = path.split('/')
    doc = iterparse(filename, ('start', 'end'))
    # Consume the root's 'start' event. The root never enters the stacks;
    # it is kept as the parent of matches that sit directly beneath it.
    _, root = next(doc)

    tag_stack = []
    elem_stack = []
    for event, elem in doc:
        if event == 'start':
            tag_stack.append(elem.tag)
            elem_stack.append(elem)
        elif event == 'end':
            if tag_stack == path_parts:
                yield elem
                # With a single-component path the stack holds only the
                # match itself, so its parent is the root element.
                parent = elem_stack[-2] if len(elem_stack) > 1 else root
                parent.remove(elem)
            # The stacks are empty only when the root's own 'end' arrives.
            if tag_stack:
                tag_stack.pop()
                elem_stack.pop()

# Find zip code with most potholes

from collections import Counter

# Counter tallies the zip text of every pothole row as the file streams by
potholes_by_zip = Counter(
    pothole.findtext('zip')
    for pothole in parse_and_remove('potholes.xml', 'row/row')
)

# most_common() lists the (zip, count) pairs from highest count to lowest
for zipcode, num in potholes_by_zip.most_common():
    print(zipcode, num)
  • iterparse() 方法允许对 XML 文档进行增量操作。
  • 使用时,你需要提供文件名和一个包含下面一种或多种类型的事件列表:start , end, start-ns 和 end-ns 。
  • 由 iterparse() 创建的迭代器会产生形如 (event, elem) 的元组,其中 event 是上述事件列表中的某一个,而 elem 是相应的 XML 元素。

6.5 将字典转换为 XML

1
2
3
4
5
6
7
8
9
10
11
12
from xml.etree.ElementTree import Element

def dict_to_xml(tag, d):
    """Turn a simple dict of key/value pairs into XML.

    Args:
        tag: Tag name of the element that is returned.
        d: Mapping whose keys become child tag names and whose values,
            converted with ``str()``, become the children's text.

    Returns:
        An ``Element`` named ``tag`` with one child per item of ``d``,
        in the dict's iteration order.
    """
    elem = Element(tag)
    for key, val in d.items():
        child = Element(key)
        child.text = str(val)
        elem.append(child)
    return elem

6.6 解析和修改 XML

1
from xml.etree.ElementTree import parse, Element

6.7 利用命名空间解析 XML 文档

1
2
3
4
5
6
7
8
9
10
11
class XMLNamespaces:
    """Expand short namespace names inside ElementTree-style paths.

    Each registered name maps to its URI wrapped in braces, so calling the
    instance on ``'{name}tag'`` returns ``'{uri}tag'``.
    """

    def __init__(self, **kwargs):
        """Register every ``name=uri`` keyword argument."""
        self.namespaces = {}
        for name, uri in kwargs.items():
            self.register(name, uri)

    def register(self, name, uri):
        """Map ``name`` to ``uri`` wrapped in curly braces."""
        self.namespaces[name] = '{' + uri + '}'

    def __call__(self, path):
        """Return ``path`` with every ``{name}`` field replaced.

        Raises:
            KeyError: if ``path`` uses a name that was never registered.
        """
        return path.format_map(self.namespaces)

6.8 与关系型数据库的交互

sqlite3 模块

  1. 连接到数据库 : connect() 函数
  2. 创建一个游标 : .cursor() 函数
  3. 执行语句 : execute() 函数
  4. 提交语句 : commit() 函数
1
2
3
4
5
import sqlite3

# 1. Connect (the database file is created if it does not exist yet)
db = sqlite3.connect('database.db')
# 2. Create a cursor
c = db.cursor()
# 3. Execute a statement. The third column is `price` with type `real`;
#    the run-together `pricereal` declared an untyped column of that name.
c.execute('create table portfolio (symbol text, shares integer, price real)')
# 4. Commit the change
db.commit()

6.9 编码和解码十六进制数

  • 将一个十六进制字符串解码成一个字节字符串
  • 将一个字节字符串编码成一个十六进制字符串。

binascii模块。

binascii : 二进制数据和 ASCII 数据的互换

1
2
3
4
5
6
7
import binascii

s = b'hello'
# bytes -> hexadecimal digits (still a bytes object)
h = binascii.b2a_hex(s)

# Show the encoded form first, then the result of decoding it again
for shown in (h, binascii.a2b_hex(h)):
    print(shown)
# b'68656c6c6f'
# b'hello'

6.10 编码解码 Base64 数据

  • base64是一种用64个可打印字符编码任意二进制的方法。
  • Base64的出现是为了让任意二进制数据能够安全地通过只支持文本(可打印 ASCII 字符)的通道传输,例如电子邮件、URL、JSON 等;它与字符编码(ASCII、GBK、Unicode等)无关,编码的对象是字节而不是文字。
  • 说白了,和ascii,utf8一样,base64就是一种编码方案,

base64 模块中两个函数 b64encode() and b64decode()

1
2
3
4
5
6
import base64

s = b'hello'
# bytes -> Base64 text (still a bytes object)
a = base64.b64encode(s)

# First line: the encoded form; second line: the result of decoding it again
print(a, base64.b64decode(a), sep='\n')
# b'aGVsbG8='
# b'hello'

6.11 读写二进制数组数据

读写一个二进制数组的结构化数据到 Python 元组中

使用 struct 模块处理二进制数据

对于需要编码和解码二进制数据的程序而言,通常会使用 struct 模块。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from struct import Struct

# Write a sequence of tuples to a binary file as packed structures
def write_records(records, format, f):
    """Pack every record with ``format`` and write it to ``f``.

    Args:
        records: Iterable of tuples; each one supplies the values for a
            single ``struct`` record.
        format: ``struct`` format string shared by all records.
        f: Object with a ``write`` method that accepts bytes, e.g. a file
            opened in binary mode.

    Raises:
        struct.error: if a record does not fit ``format``.
    """
    # Compile the format once instead of re-parsing it for every record
    record_struct = Struct(format)
    for r in records:
        f.write(record_struct.pack(*r))

if __name__ == '__main__':
    # Three sample records, each an int followed by two floats
    records = [
        (1, 2.3, 4.5),
        (6, 7.8, 9.0),
        (12, 13.4, 56.7),
    ]

    # '<idd': little-endian, one int and two doubles per record
    with open('data.b', 'wb') as f:
        write_records(records, '<idd', f)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
from struct import Struct

# Read tuples back from a binary file, one fixed-size chunk at a time
def read_records(format, f):
    """Return a generator of the records unpacked from ``f``.

    Args:
        format: ``struct`` format string describing one record.
        f: Binary file object. It is read lazily, so it must stay open
            while the returned generator is being consumed.

    Returns:
        A generator yielding one tuple per record.

    Raises:
        struct.error: during iteration, if the data ends with a chunk
            shorter than one record.
    """
    record_struct = Struct(format)
    # iter(callable, sentinel): keep reading record-sized chunks until
    # read() returns b'' at end of file
    chunks = iter(lambda: f.read(record_struct.size), b'')
    # Return a generator
    return (record_struct.unpack(chunk) for chunk in chunks)


if __name__ == '__main__':
    with open('data.b', 'rb') as f:
        for rec in read_records('<idd', f):
            # Placeholder: handle each unpacked record tuple here
            pass
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from struct import Struct

# Unpack records from a byte string that holds the whole file
def unpack_records(format, data):
    """Return a generator of the records stored in ``data``.

    Args:
        format: ``struct`` format string describing one record.
        data: Bytes-like object containing the records back to back.

    Returns:
        A generator yielding one tuple per record.

    Raises:
        struct.error: during iteration, if ``data`` ends with fewer bytes
            than one full record.
    """
    record_struct = Struct(format)
    # Step through the whole buffer one record size at a time;
    # unpack_from reads at the given offset without slicing the data
    return (record_struct.unpack_from(data, offset)
            for offset in range(0, len(data), record_struct.size))


if __name__ == '__main__':
    # Read the whole file into memory, then unpack it record by record
    with open('data.b', 'rb') as f:
        data = f.read()
    for rec in unpack_records('<idd', data):
        # Placeholder: handle each unpacked record tuple here
        pass

6.12 读取嵌套和可变长二进制数据

struct 模块可被用来编码/解码几乎所有类型的二进制的数据结构。

很难,看不懂,略

6.13 数据的累加与统计操作

处理一个很大的数据集并需要计算数据总和或其他统计量。

Pandas 库