#==============================================================================
# Chapter 2  Understanding your data
# 2.1 Introduction
# 2.2 Importing data from CSV files
#==============================================================================
#Peek at the file first: $ head some_file.csv
import csv
import sys

filename = 'ch02-data.csv'
data = []
header = None
try:
    #No need to close the file when we are done with it;
    #the with statement's context manager takes care of that.
    with open(filename) as f:
        reader = csv.reader(f)
        c = 0
        for row in reader:
            if c == 0:
                header = row
            else:
                data.append(row)
            c += 1
except csv.Error as e:
    print "Error reading CSV file at line %s: %s" % (reader.line_num, e)
    sys.exit(-1)

if header:
    print header
    print '=================='
for datarow in data:
    print datarow
--------------------------------------------------------------------------
#Loading large data files
import numpy
#numpy.loadtxt is fast
data = numpy.loadtxt('ch02-data.csv', dtype='string', delimiter=',')
for datarow in data:
    print datarow
#numpy.genfromtxt handles missing data better
data = numpy.genfromtxt('ch02-data.csv', dtype='string', delimiter=',')
for datarow in data:
    print datarow
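#To see what genfromtxt adds, a minimal sketch; the file name ch02-data-missing.csv and
#its contents (numeric rows with empty fields, e.g. a line like "1,,3") are assumed here
#purely for illustration:
import numpy
# loadtxt would fail on the empty fields; genfromtxt can fill them in
data = numpy.genfromtxt('ch02-data-missing.csv', delimiter=',',
                        filling_values=0)
print data  # empty fields come back as 0 instead of raising an error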
#==============================================================================
# 2.3 Importing data from Microsoft Excel files
#==============================================================================
#Reading and writing are supported by separate modules (xlrd/xlwt); both are cross-platform!
$ mkvirtualenv xlrdexample
(xlrdexample)$ pip install xlrd
#Read cell contents from a worksheet of the given workbook, iterating over its nrows rows and ncols columns:
#xlrd loads only the required parts of the file into memory, on demand
import xlrd
from xlrd.xldate import XLDateAmbiguous

file = 'ch02-xlsxdata.xlsx'
wb = xlrd.open_workbook(filename=file)
#with on_demand=True the workbook is loaded lazily, worksheet by worksheet
wb = xlrd.open_workbook(filename=file, on_demand=True)
ws = wb.sheet_by_name('Sheet1')
dataset = []
for r in range(ws.nrows):
    col = []
    for c in range(ws.ncols):
        col.append(ws.cell(r, c).value)
        #if ws.cell_type(r, c) == xlrd.XL_CELL_DATE:
        #    try:
        #        print ws.cell_type(r, c)
        #        from datetime import datetime
        #        date_value = xlrd.xldate_as_tuple(ws.cell(r, c).value, wb.datemode)
        #        print datetime(*date_value)
        #    except XLDateAmbiguous as e:
        #        print e
    dataset.append(col)

#from pprint import pprint
#pprint(dataset)
#==============================================================================
# 2.4 Importing data from fixed-width data files
#==============================================================================
#When performance matters more, or the file to parse is very large, use Python's struct module
import struct

mask = '9s14s5s'
parse = struct.Struct(mask).unpack_from
print 'formatstring {!r}, record size: {}'.format(mask, struct.calcsize(mask))

datafile = 'ch02-fixed-width-1M.data'
with open(datafile, 'r') as f:
    for line in f:
        fields = parse(line)
        print 'fields: ', [field.strip() for field in fields]
#==============================================================================
# 2.5 Reading data from tab-delimited files
#==============================================================================
"day" "ammount"
2013-01-24 323
2013-01-25 233
2013-01-26 433
2013-01-27 555
2013-01-28 123
2013-01-29 0
2013-01-30 221
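#Tab-separated values can be read with the same csv module by switching to the excel-tab
#dialect; a minimal sketch (the file name ch02-data.tab is assumed to hold the sample above):
import csv

filename = 'ch02-data.tab'  # assumed name for the sample data above
data = []
try:
    with open(filename) as f:
        reader = csv.reader(f, dialect=csv.excel_tab)
        header = next(reader)  # first row holds the column names
        for row in reader:
            data.append(row)
except csv.Error as e:
    print "Error reading tab-delimited file: %s" % e

print header
for datarow in data:
    print datarow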
------------------------------------------------------------------
"day" "ammount"
2013-01-24 323
2013-01-25 233
2013-01-26 433
2013-01-27 555
2013-01-28 123
2013-01-29 0
2013-01-30 221
datafile = 'ch02-data-dirty.tab'
with open(datafile, 'r') as f:
    for line in f:
        # remove next comment to see the line before cleanup
        # print 'DIRTY: ', line.split('\t')
        # strip any whitespace at the start or end of the line
        line = line.strip()
        # now split the line on the tab delimiter
        print line.split('\t')
------------------------------------------------------------------
#==============================================================================
# 2.6 Importing data from a JSON data source
#==============================================================================
$ pip install requests
import requests
import json
url = 'https://github.com/timeline.json'
r = requests.get(url)
json_obj = r.json()
print json.dumps(json_obj,sort_keys=True,indent=4)
data1 = {'b':789,'c':456,'a':123}
print json.dumps(data1,sort_keys=True,indent=4)
=============================
1. Encoding and decoding simple data types:
Use json.dumps to encode simple data types.
During encoding, Python native types are converted to JSON types (and converted back on decoding):

Python             JSON           Python (decoded)
-----------------  -------------  ----------------
dict               object         dict
list, tuple        array          list
str, unicode       string         unicode
int, long, float   number (int)   int, long
                   number (real)  float
True               true           True
False              false          False
None               null           None
#coding=utf-8
import json
from decimal import Decimal
obj = [[1,2,3],123,123.123,'abc',{'key1':(1,2,3),'key2':(4,5,6)}]
encodedjson = json.dumps(obj)
print repr(obj)
print encodedjson
#[[1, 2, 3], 123, 123.123, 'abc', {'key2': (4, 5, 6), 'key1': (1, 2, 3)}]
#[[1, 2, 3], 123, 123.123, "abc", {"key2": [4, 5, 6], "key1": [1, 2, 3]}]
------------------------------------------------------------------------------
json.dumps offers several useful keyword parameters; the most common are sort_keys
(sorts the dict keys; by default dicts are stored unordered), separators, and indent.
data1 = {'b':789,'c':456,'a':123}
data2 = {'a':123,'b':789,'c':456}
d1 = json.dumps(data1,sort_keys=True)
print d1 #{"a": 123, "b": 789, "c": 456}
d2 = json.dumps(data2)
print d2 #{"a": 123, "c": 456, "b": 789}
d3 = json.dumps(data2,sort_keys=True)
print d3 #{"a": 123, "b": 789, "c": 456}
print d1==d2  #False
print d1==d3  #True
#The indent parameter controls indentation and makes the stored data far more readable.
print json.dumps(data1,sort_keys=True,indent=4)
#{
# "a": 123,
# "b": 789,
# "c": 456
#}
------------------------------------------------------------------------
loads returns the original object, but some type conversions still happen; e.g. 'abc' is decoded as unicode.
decodejson = json.loads(encodedjson)
print type(decodejson)
print decodejson[4]['key1']
print decodejson
#<type 'list'>
#[1, 2, 3]
#[[1, 2, 3], 123, 123.123, u'abc', {u'key2': [4, 5, 6], u'key1': [1, 2, 3]}]
jstring='{"name":"xue","price":12.50}'
decodejson=json.loads(jstring,parse_float=Decimal)
print decodejson
#{u'name': u'xue', u'price': Decimal('12.50')}
------------------------------------------------------------------------
JSON exists primarily as a data-interchange format, and network communication cares a lot
about payload size: useless whitespace wastes bandwidth, so the data should be compacted
when appropriate. The separators parameter serves this purpose; it takes a tuple of the
strings used to separate items and keys.
#coding=utf-8
import json
data={'a': 123, 'c': 456, 'b': 789}
#DATA: {'a': 123, 'c': 456, 'b': 789}
print 'repr(data) :', len(repr(data))
#repr(data): 30
print 'dumps(data) :', len(json.dumps(data))
#dumps(data): 30
print 'dumps(data, indent=4):', len(json.dumps(data, indent=4))
#dumps(data, indent=4): 44
print 'dumps(data, separators):', len(json.dumps(data, separators=(',',':')))
#dumps(data, separators): 25
print json.dumps(data, separators=(',',':'))
#{"a":123,"c":456,"b":789}
------------------------------------------------------------------------
Another useful dumps parameter is skipkeys, which defaults to False. When dumps serializes
a dict, the keys must be strings; any other key type raises a TypeError. With skipkeys=True
such keys are skipped gracefully instead.
data = {'b':789,'c':456,(1,2):123}
print json.dumps(data,skipkeys=True)
#{"c": 456, "b": 789}
------------------------------------------------------------------------
=======================
2. Handling your own data types
#Approach 2: subclass JSONEncoder and JSONDecoder and override the relevant methods
import json

class Person(object):
    def __init__(self, name, age):
        self.name = name
        self.age = age
    def __repr__(self):
        return 'Person Object name: %s, age: %d' % (self.name, self.age)

class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        #convert object to a dict
        d = {}
        d['__class__'] = obj.__class__.__name__
        d['__module__'] = obj.__module__
        d.update(obj.__dict__)
        return d

class MyDecoder(json.JSONDecoder):
    def __init__(self):
        json.JSONDecoder.__init__(self, object_hook=self.dict2object)
    def dict2object(self, d):
        #convert dict to object
        if '__class__' in d:
            class_name = d.pop('__class__')
            module_name = d.pop('__module__')
            module = __import__(module_name)
            class_ = getattr(module, class_name)
            args = dict((key.encode('ascii'), value) for key, value in d.items())  # get args
            inst = class_(**args)  # create new instance
        else:
            inst = d
        return inst

if __name__ == '__main__':
    p = Person('Peter', 22)
    print p
    d = MyEncoder().encode(p)
    print d
    o = MyDecoder().decode(d)
    print type(o)
#==============================================================================
# 2.7 Exporting data to JSON, CSV, and Excel
#==============================================================================
#Excel write support
$ pip install xlwt
import struct
import csv
import json
import sys

def import_data(import_file):
    mask = '9s14s5s'
    data = []
    parse = struct.Struct(mask).unpack_from
    i = 1
    with open(import_file, 'r') as f:
        for line in f:
            fields = parse(line)
            data.append(list([f.strip() for f in fields]))
            # only import the first 100 records
            i += 1
            if i > 100:
                return data
    return data
-----------------------------------------------------------------
def write_data(data, export_format):
    if export_format == 'csv':
        return write_csv(data)
    elif export_format == 'json':
        return write_json(data)
    elif export_format == 'xlsx':
        return write_xlsx(data)
    else:
        raise Exception("Illegal format defined")
-----------------------------------------------------------------
def write_csv(data):
    # with ensures the file is flushed and closed
    with open('export_csv.csv', 'wb') as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)
    return "export_csv.csv"
-----------------------------------------------------------------
def write_json(data):
    # use "w" so repeated exports overwrite instead of appending
    with open("export_json.json", "w") as file_obj:
        file_obj.write(json.dumps(data))
    return "export_json.json"
-----------------------------------------------------------------
def write_xlsx(data):
    from xlwt import Workbook
    book = Workbook()
    sheet1 = book.add_sheet("Sheet 1")
    row = 0
    for line in data:
        col = 0
        for datum in line:
            sheet1.write(row, col, datum)
            col += 1
        row += 1
        # xls sheets have a hard limit of 65535 rows
        if row > 65535:
            print >> sys.stderr, "Hit limit of # of rows in one sheet (65535)."
            break
    book.save("export_xlsx.xls")
    return "export_xlsx.xls"
#==============================================================================
# 2.8 Importing data from a database
#==============================================================================
#sqlite3 ships with the Python standard library; there is nothing to install
import sqlite3
import sys

script_path = "/root/PycharmProjects/demo/world.sql"
db = '/root/PycharmProjects/demo/world.db'
# if no DB file is given, create an in-memory database
#db = ":memory:"
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        with open(script_path, 'rb') as f:
            cur.executescript(f.read())
    print("Finish")
except sqlite3.Error as err:
    print "Error occurred: %s" % err
-----------------------------------------------------------------
import sqlite3
import sys

db = "/root/PycharmProjects/demo/world.db"
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()
        # extract column names
        col_names = [cn[0] for cn in cur.description]
        print "%10s %30s %10s" % tuple(col_names)
        print "=" * (10 + 1 + 30 + 1 + 10)
        for row in resultset:
            print "%10s %30s %10s" % row
except sqlite3.Error as err:
    print "[ERROR]:", err
#==============================================================================
# 2.9 Cleaning up outliers
#==============================================================================
$ yum install libpng
$ pip install matplotlib
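#This recipe flags outliers with the median absolute deviation (MAD); a minimal sketch of
#that idea, assuming a NumPy array input and the conventional threshold of 3.5:
import numpy as np

def is_outlier(points, threshold=3.5):
    # boolean mask: True where the modified z-score based on the
    # median absolute deviation exceeds the threshold
    if len(points.shape) == 1:
        points = points[:, None]
    median = np.median(points, axis=0)
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > threshold

x = np.random.random(100)
x = np.r_[x, -3, -10, 100]  # append a few obvious outliers
print x[is_outlier(x)]      # roughly [ -3. -10. 100.]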
#==============================================================================
# 2.10 Reading large data files in chunks
#==============================================================================
#Even very large files can be handled easily this way (lines are loaded on demand)
with open(bigfile_path, 'r') as bigfile:
    for line in bigfile:
        #... process each line here
        pass
#Read the file block by block instead of pulling the whole file into memory
import sys

filename = "ch02-fixed-width-1M.data"
with open(filename, 'rb') as hugefile:
    chunksize = 1000
    readable = ''
    # if you want to stop after a certain number of blocks,
    # put a condition in the while
    while hugefile:
        start = hugefile.tell()
        # read the next chunksize lines as one block;
        # next() raises StopIteration at end of file
        for _ in range(chunksize):
            file_block = hugefile.next()
            print file_block
            readable = readable + file_block
        stop = hugefile.tell()
        print 'readable: %s' % readable
        print 'reading bytes from %s to %s' % (start, stop)
        print 'read bytes total:', len(readable)
        raw_input()
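#If fixed-size byte blocks are enough (rather than whole lines), a simpler sketch
#uses iter() with a sentinel:
# iter() keeps calling f.read(chunksize) until it returns the
# empty-string sentinel at end of file
with open('ch02-fixed-width-1M.data', 'rb') as f:
    chunksize = 4096
    for block in iter(lambda: f.read(chunksize), ''):
        print 'got block of %d bytes' % len(block)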
#Other approaches for reading huge files:
#  parallel frameworks such as MapReduce
#  multiprocessing (a pool of worker processes)
#==============================================================================
# 2.11 Reading streaming data sources
#==============================================================================
##Reading a file that another process keeps appending to in real time
#Use case: the input may be a file-like object or a remote HTTP resource;
#we can read input from the remote service, parse it continuously, and update charts in
#real time, or feed an intermediate queue, buffer, or database.
#In more complex data pipelines a message queue is required: continuously arriving data
#is parked in the queue for some time before we receive it.
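#A toy sketch of that buffering idea, using only the standard library's Queue and a
#thread rather than a real message broker:
import Queue
import threading
import time

q = Queue.Queue(maxsize=100)  # bounded buffer between producer and consumer

def producer():
    # stands in for the continuously arriving data
    for i in range(5):
        q.put('record-%d' % i)
        time.sleep(0.1)
    q.put(None)  # sentinel: no more data

t = threading.Thread(target=producer)
t.start()
while True:
    item = q.get()  # blocks until something arrives
    if item is None:
        break
    print 'consumed:', item
t.join()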
import time
import os

filename = 'stream.data'
with open(filename, 'r') as file:
    # move to the end of the file
    filesize = os.stat(filename)[6]
    file.seek(filesize)
    while True:
        where = file.tell()
        # try reading a line
        line = file.readline()
        # if empty, wait a second and go back
        if not line:
            time.sleep(1)
            file.seek(where)
        else:
            # the trailing , stops print from adding a newline;
            # readline() already kept the newline from the file
            print line,
----------------------------------------------------------------------------
#coding=utf-8
#The first generator reads bytes from a file
def FileStream(filename):
    with open(filename) as f:
        for line in f:
            for byte in line:
                yield byte

#The second filters the bytes in the stream
def FilterStream(source, condition):
    for byte in source:
        if condition(byte):
            yield byte

#The third prints whatever streams through
def PrintStream(source):
    for byte in source:
        print byte

PrintStream(FilterStream(FileStream('stream.data'), str.islower))
#==============================================================================
# 2.12 Importing image data into NumPy arrays
#==============================================================================
$ pip install scipy
#############The Lena image
#SciPy bundles this image in its misc module, so it can be reused easily
import scipy.misc
import matplotlib.pyplot as plt
lena = scipy.misc.lena()
plt.gray()
plt.imshow(lena)
plt.colorbar()
plt.show()
print lena.shape #(512, 512)
print lena.max() #245
print lena.dtype #int32
##########PIL
pip install PIL --allow-external PIL --allow-unverified PIL
##########Zooming into an image
import matplotlib.pyplot as plt
import scipy
import numpy
# because the image we loaded is RGB image,
# http://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale
bug = scipy.misc.imread('stinkbug.png')
# inspect the shape of the loaded image
print bug.shape  #(375, 500, 3)
# take a single channel to get a 2-D grayscale array
bug = bug[:,:,0]
# show original image
plt.figure()
plt.gray()
plt.subplot(121)
plt.imshow(bug)
# show 'zoomed' region
zbug = bug[100:350,140:350]  # zoom into the sub-matrix [100:350, 140:350]
plt.subplot(122)
plt.imshow(zbug)
plt.show()
###########For big images, numpy.memmap is recommended for memory-mapping the image
import numpy
#note: this maps the raw bytes of the file; it does not decode the PNG
image = numpy.memmap('stinkbug.png', dtype=int, mode='r+', shape=(375, 500))
#A dedicated image-processing package:
scikit-image:http://scikit-image.org/
#==============================================================================
# 2.13 Generating controlled random datasets
#==============================================================================
############Generating a simple random sample
import pylab
import random
SAMPLE_SIZE = 100
random.seed()  # seed the random generator; with no argument it uses the current system time
real_rand_vars = []  # store generated random values here
# we don't need the iterator value, so we call it '_'
for _ in range(SAMPLE_SIZE):
new_value = random.random() # get next random value
real_rand_vars.append(new_value)
# create histogram from data in 10 buckets
pylab.hist(real_rand_vars, 10)
pylab.xlabel("Number range")
pylab.ylabel("Count")
# show figure
pylab.show()
############Generating a time series of fake price-growth data
import pylab
import random
# days to generate data for
duration = 100
mean_inc = 0.2  # mean of the daily increment
std_dev_inc = 1.2  # standard deviation of the increment
x = range(duration)  # time series
y = []
price_today = 0  # starting price
for i in x:
    next_delta = random.normalvariate(mean_inc, std_dev_inc)
    price_today += next_delta
    y.append(price_today)
pylab.plot(x,y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()
###################Histograms for different distributions
# coding: utf-8
import random
import matplotlib
import matplotlib.pyplot as plt
SAMPLE_SIZE = 1000
buckets = 100  # histogram buckets
plt.figure()
matplotlib.rcParams.update({'font.size': 7})  # smaller font, just for this example
###uniformly distributed floats in [0.0, 1.0)
plt.subplot(621)
plt.xlabel("random.random")
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.random())
plt.hist(res, buckets)
###uniform distribution over [a, b]
plt.subplot(622)
plt.xlabel("random.uniform")
a, b = 1, SAMPLE_SIZE
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.uniform(a, b))
plt.hist(res, buckets)
###triangular distribution
plt.subplot(623)
plt.xlabel("random.triangular")
low, high = 1, SAMPLE_SIZE
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.triangular(low, high))
plt.hist(res, buckets)
###beta distribution
plt.subplot(624)
plt.xlabel("random.betavariate")
alpha, beta = 1, 10
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.betavariate(alpha, beta))
plt.hist(res, buckets)
###exponential distribution
plt.subplot(625)
plt.xlabel("random.expovariate")
lambd = 1.0 / ((SAMPLE_SIZE + 1) / 2.)
res = []
for _ in xrange(1, SAMPLE_SIZE): res.append(random.expovariate(lambd))
plt.hist(res, buckets)
###gamma distribution
# conditions on the parameters: alpha > 0 and beta > 0
# the probability density function is:
#            x ** (alpha - 1) * math.exp(-x / beta)
# pdf(x) = ----------------------------------------
#            math.gamma(alpha) * beta ** alpha
plt.subplot(626)
plt.xlabel("random.gammavariate")
alpha, beta = 1, 10
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.gammavariate(alpha, beta))
plt.hist(res, buckets)
###log-normal distribution; mu is the mean, sigma the standard deviation of the underlying normal
plt.subplot(627)
plt.xlabel("random.lognormvariate")
mu, sigma = 1, 0.5
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.lognormvariate(mu, sigma))
plt.hist(res, buckets)
###normal distribution; mu is the mean, sigma the standard deviation
plt.subplot(628)
plt.xlabel("random.normalvariate")
mu, sigma = 1, 0.5
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.normalvariate(mu, sigma))
plt.hist(res, buckets)
###Pareto distribution; alpha is the shape parameter
plt.subplot(629)
plt.xlabel("random.paretovariate")
alpha = 1
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.paretovariate(alpha))
plt.hist(res, buckets)
plt.tight_layout()
plt.show()
###########Random numbers
1) Seed the pseudo-random generator with seed() so that random() reproduces the same
expected values. This is useful, and better than generating random numbers in advance
and saving them to a file.
2) To avoid repeated sequences, random.SystemRandom is recommended; it is backed by
os.urandom. os.urandom has access to more entropy sources; seed() and setstate() have
no effect on it, so the samples are not reproducible.
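#A minimal sketch of the difference between the two generators:
import random

r1 = random.Random()
r2 = random.Random()
r1.seed(42)
r2.seed(42)
print r1.random() == r2.random()  # True: seeded generators are reproducible

sr = random.SystemRandom()  # backed by os.urandom
sr.seed(42)                 # silently ignored; has no effect
print sr.random()           # different on every run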
Example: we want some random words
#Linux / Unix systems
import random
with open('/usr/share/dict/words') as f:
    words = f.readlines()
words = [w.rstrip() for w in words]
for w in random.sample(words, 5):
    print w
#Windows
http://norvig.com/big.txt
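#On Windows there is no /usr/share/dict/words, so download a word list such as big.txt
#from the URL above first; a sketch assuming the file was saved next to the script:
import random
# big.txt is assumed to have been downloaded manually
with open('big.txt') as f:
    words = list(set(f.read().split()))
for w in random.sample(words, 5):
    print w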
#==============================================================================
# 2.14 Smoothing noise in real-world data
#==============================================================================