#==============================================================================
# Chapter 2  Understanding your data
# 2.1 Introduction
# 2.2 Importing data from CSV files
#==============================================================================
#Peek at the file first: $ head some_file.csv
import csv
import sys

filename = 'ch02-data.csv'
data = []
header = None
try:
    #No need to close the file when we are done with it;
    #the with statement's context manager takes care of that.
    with open(filename) as f:
        reader = csv.reader(f)
        c = 0
        for row in reader:
            if c == 0:
                header = row
            else:
                data.append(row)
            c += 1
except csv.Error as e:
    print "Error reading CSV file at line %s: %s" % (reader.line_num, e)
    sys.exit(-1)

if header:
    print header
    print '=================='
for datarow in data:
    print datarow
--------------------------------------------------------------------------
#Loading large data files
import numpy
#numpy.loadtxt is fast
data = numpy.loadtxt('ch02-data.csv', dtype='string', delimiter=',')
for datarow in data:
    print datarow
#numpy.genfromtxt handles missing data better
data = numpy.genfromtxt('ch02-data.csv', dtype='string', delimiter=',')
for datarow in data:
    print datarow
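#To see what genfromtxt adds, a minimal sketch; the file name ch02-data-missing.csv and
#its contents (numeric rows with empty fields, e.g. a line like "1,,3") are assumed here
#purely for illustration:
import numpy
# loadtxt would fail on the empty fields; genfromtxt can fill them in
data = numpy.genfromtxt('ch02-data-missing.csv', delimiter=',',
                        filling_values=0)
print data  # empty fields come back as 0 instead of raising an error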
#==============================================================================
# 2.3 Importing data from Microsoft Excel files
#==============================================================================
#Reading and writing are supported by separate modules (xlrd/xlwt); both are cross-platform!
$ mkvirtualenv xlrdexample
(xlrdexample)$ pip install xlrd
#Read cell contents from a worksheet of the given workbook, iterating over its nrows rows and ncols columns:
#xlrd loads only the required parts of the file into memory, on demand
import xlrd
from xlrd.xldate import XLDateAmbiguous

file = 'ch02-xlsxdata.xlsx'
wb = xlrd.open_workbook(filename=file)
#with on_demand=True the workbook is loaded lazily, worksheet by worksheet
wb = xlrd.open_workbook(filename=file, on_demand=True)
ws = wb.sheet_by_name('Sheet1')
dataset = []
for r in range(ws.nrows):
    col = []
    for c in range(ws.ncols):
        col.append(ws.cell(r, c).value)
        #if ws.cell_type(r, c) == xlrd.XL_CELL_DATE:
        #    try:
        #        print ws.cell_type(r, c)
        #        from datetime import datetime
        #        date_value = xlrd.xldate_as_tuple(ws.cell(r, c).value, wb.datemode)
        #        print datetime(*date_value)
        #    except XLDateAmbiguous as e:
        #        print e
    dataset.append(col)

#from pprint import pprint
#pprint(dataset)
#==============================================================================
# 2.4 Importing data from fixed-width data files
#==============================================================================
#When performance matters more, or the file to parse is very large, use Python's struct module
import struct

mask = '9s14s5s'
parse = struct.Struct(mask).unpack_from
print 'formatstring {!r}, record size: {}'.format(mask, struct.calcsize(mask))

datafile = 'ch02-fixed-width-1M.data'
with open(datafile, 'r') as f:
    for line in f:
        fields = parse(line)
        print 'fields: ', [field.strip() for field in fields]
#==============================================================================
# 2.5 Reading data from tab-delimited files
#==============================================================================
"day" "ammount"
2013-01-24 323
2013-01-25 233
2013-01-26 433
2013-01-27 555
2013-01-28 123
2013-01-29 0
2013-01-30 221
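#Tab-separated values can be read with the same csv module by switching to the excel-tab
#dialect; a minimal sketch (the file name ch02-data.tab is assumed to hold the sample above):
import csv

filename = 'ch02-data.tab'  # assumed name for the sample data above
data = []
try:
    with open(filename) as f:
        reader = csv.reader(f, dialect=csv.excel_tab)
        header = next(reader)  # first row holds the column names
        for row in reader:
            data.append(row)
except csv.Error as e:
    print "Error reading tab-delimited file: %s" % e

print header
for datarow in data:
    print datarow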
------------------------------------------------------------------
"day" "ammount"
2013-01-24 323
2013-01-25 233
2013-01-26 433
2013-01-27 555
2013-01-28 123
2013-01-29 0
2013-01-30 221
datafile = 'ch02-data-dirty.tab'
with open(datafile, 'r') as f:
    for line in f:
        # remove next comment to see the line before cleanup
        # print 'DIRTY: ', line.split('\t')
        # strip any whitespace at the start or end of the line
        line = line.strip()
        # now split the line on the tab delimiter
        print line.split('\t')
------------------------------------------------------------------
#==============================================================================
# 2.6 Importing data from a JSON data source
#==============================================================================
$ pip install requests
import requests
import json
url = 'https://github.com/timeline.json'
r = requests.get(url)
json_obj = r.json()
print json.dumps(json_obj,sort_keys=True,indent=4)
data1 = {'b':789,'c':456,'a':123}
print json.dumps(data1,sort_keys=True,indent=4)
=============================
1. Encoding and decoding simple data types:
Use json.dumps to encode simple data types.
During encoding, Python native types are converted to JSON types (and converted back on decoding):

Python             JSON           Python (decoded)
-----------------  -------------  ----------------
dict               object         dict
list, tuple        array          list
str, unicode       string         unicode
int, long, float   number (int)   int, long
                   number (real)  float
True               true           True
False              false          False
None               null           None
#coding=utf-8
import json
from decimal import Decimal
obj = [[1,2,3],123,123.123,'abc',{'key1':(1,2,3),'key2':(4,5,6)}]
encodedjson = json.dumps(obj)
print repr(obj)
print encodedjson
#[[1, 2, 3], 123, 123.123, 'abc', {'key2': (4, 5, 6), 'key1': (1, 2, 3)}]
#[[1, 2, 3], 123, 123.123, "abc", {"key2": [4, 5, 6], "key1": [1, 2, 3]}]
------------------------------------------------------------------------------
json.dumps offers several useful keyword parameters; the most common are sort_keys
(sorts the dict keys; by default dicts are stored unordered), separators, and indent.
data1 = {'b':789,'c':456,'a':123}
data2 = {'a':123,'b':789,'c':456}
d1 = json.dumps(data1,sort_keys=True)
print d1 #{"a": 123, "b": 789, "c": 456}
d2 = json.dumps(data2)
print d2 #{"a": 123, "c": 456, "b": 789}
d3 = json.dumps(data2,sort_keys=True)
print d3 #{"a": 123, "b": 789, "c": 456}
print d1==d2  #False
print d1==d3  #True
#The indent parameter controls indentation and makes the stored data far more readable.
print json.dumps(data1,sort_keys=True,indent=4)
#{
# "a": 123,
# "b": 789,
# "c": 456
#}
------------------------------------------------------------------------
loads returns the original object, but some type conversions still happen; e.g. 'abc' is decoded as unicode.
decodejson = json.loads(encodedjson)
print type(decodejson)
print decodejson[4]['key1']
print decodejson
#<type 'list'>
#[1, 2, 3]
#[[1, 2, 3], 123, 123.123, u'abc', {u'key2': [4, 5, 6], u'key1': [1, 2, 3]}]
jstring='{"name":"xue","price":12.50}'
decodejson=json.loads(jstring,parse_float=Decimal)
print decodejson
#{u'name': u'xue', u'price': Decimal('12.50')}
------------------------------------------------------------------------
JSON exists primarily as a data-interchange format, and network communication cares a lot
about payload size: useless whitespace wastes bandwidth, so the data should be compacted
when appropriate. The separators parameter serves this purpose; it takes a tuple of the
strings used to separate items and keys.
#coding=utf-8
import json
data={'a': 123, 'c': 456, 'b': 789}
#DATA: {'a': 123, 'c': 456, 'b': 789}
print 'repr(data) :', len(repr(data))
#repr(data): 30
print 'dumps(data) :', len(json.dumps(data))
#dumps(data): 30
print 'dumps(data, indent=4):', len(json.dumps(data, indent=4))
#dumps(data, indent=4): 44
print 'dumps(data, separators):', len(json.dumps(data, separators=(',',':')))
#dumps(data, separators): 25
print json.dumps(data, separators=(',',':'))
#{"a":123,"c":456,"b":789}
------------------------------------------------------------------------
Another useful dumps parameter is skipkeys, which defaults to False. When dumps serializes
a dict, the keys must be strings; any other key type raises a TypeError. With skipkeys=True
such keys are skipped gracefully instead.
data = {'b':789,'c':456,(1,2):123}
print json.dumps(data,skipkeys=True)
#{"c": 456, "b": 789}
------------------------------------------------------------------------
=======================
2. Handling your own data types
#Approach 2: subclass JSONEncoder and JSONDecoder and override the relevant methods
import json

class Person(object):
    def __init__(self, name, age):
        self.name = name
        self.age = age
    def __repr__(self):
        return 'Person Object name: %s, age: %d' % (self.name, self.age)

class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        #convert object to a dict
        d = {}
        d['__class__'] = obj.__class__.__name__
        d['__module__'] = obj.__module__
        d.update(obj.__dict__)
        return d

class MyDecoder(json.JSONDecoder):
    def __init__(self):
        json.JSONDecoder.__init__(self, object_hook=self.dict2object)
    def dict2object(self, d):
        #convert dict to object
        if '__class__' in d:
            class_name = d.pop('__class__')
            module_name = d.pop('__module__')
            module = __import__(module_name)
            class_ = getattr(module, class_name)
            args = dict((key.encode('ascii'), value) for key, value in d.items())  # get args
            inst = class_(**args)  # create new instance
        else:
            inst = d
        return inst

if __name__ == '__main__':
    p = Person('Peter', 22)
    print p
    d = MyEncoder().encode(p)
    print d
    o = MyDecoder().decode(d)
    print type(o)
#==============================================================================
# 2.7 Exporting data to JSON, CSV, and Excel
#==============================================================================
#Excel write support
$ pip install xlwt
import struct
import csv
import json
import sys

def import_data(import_file):
    mask = '9s14s5s'
    data = []
    parse = struct.Struct(mask).unpack_from
    i = 1
    with open(import_file, 'r') as f:
        for line in f:
            fields = parse(line)
            data.append(list([f.strip() for f in fields]))
            # only import the first 100 records
            i += 1
            if i > 100:
                return data
    return data
-----------------------------------------------------------------
def write_data(data, export_format):
    if export_format == 'csv':
        return write_csv(data)
    elif export_format == 'json':
        return write_json(data)
    elif export_format == 'xlsx':
        return write_xlsx(data)
    else:
        raise Exception("Illegal format defined")
-----------------------------------------------------------------
def write_csv(data):
    # with ensures the file is flushed and closed
    with open('export_csv.csv', 'wb') as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)
    return "export_csv.csv"
-----------------------------------------------------------------
def write_json(data):
    # use "w" so repeated exports overwrite instead of appending
    with open("export_json.json", "w") as file_obj:
        file_obj.write(json.dumps(data))
    return "export_json.json"
-----------------------------------------------------------------
def write_xlsx(data):
    from xlwt import Workbook
    book = Workbook()
    sheet1 = book.add_sheet("Sheet 1")
    row = 0
    for line in data:
        col = 0
        for datum in line:
            sheet1.write(row, col, datum)
            col += 1
        row += 1
        # xls sheets have a hard limit of 65535 rows
        if row > 65535:
            print >> sys.stderr, "Hit limit of # of rows in one sheet (65535)."
            break
    book.save("export_xlsx.xls")
    return "export_xlsx.xls"
#==============================================================================
# 2.8 Importing data from a database
#==============================================================================
#sqlite3 ships with the Python standard library; there is nothing to install
import sqlite3
import sys

script_path = "/root/PycharmProjects/demo/world.sql"
db = '/root/PycharmProjects/demo/world.db'
# if no DB file is given, create an in-memory database
#db = ":memory:"
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        with open(script_path, 'rb') as f:
            cur.executescript(f.read())
    print("Finish")
except sqlite3.Error as err:
    print "Error occurred: %s" % err
-----------------------------------------------------------------
import sqlite3
import sys

db = "/root/PycharmProjects/demo/world.db"
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID, Name, Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()
        # extract column names
        col_names = [cn[0] for cn in cur.description]
        print "%10s %30s %10s" % tuple(col_names)
        print "=" * (10 + 1 + 30 + 1 + 10)
        for row in resultset:
            print "%10s %30s %10s" % row
except sqlite3.Error as err:
    print "[ERROR]:", err
#==============================================================================
# 2.9 Cleaning up outliers
#==============================================================================
$ yum install libpng
$ pip install matplotlib
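#This recipe flags outliers with the median absolute deviation (MAD); a minimal sketch of
#that idea, assuming a NumPy array input and the conventional threshold of 3.5:
import numpy as np

def is_outlier(points, threshold=3.5):
    # boolean mask: True where the modified z-score based on the
    # median absolute deviation exceeds the threshold
    if len(points.shape) == 1:
        points = points[:, None]
    median = np.median(points, axis=0)
    diff = np.sqrt(np.sum((points - median) ** 2, axis=-1))
    med_abs_deviation = np.median(diff)
    modified_z_score = 0.6745 * diff / med_abs_deviation
    return modified_z_score > threshold

x = np.random.random(100)
x = np.r_[x, -3, -10, 100]  # append a few obvious outliers
print x[is_outlier(x)]      # roughly [ -3. -10. 100.]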
#==============================================================================
# 2.10 Reading large data files in chunks
#==============================================================================
#Even very large files can be handled easily this way (lines are loaded on demand)
with open(bigfile_path, 'r') as bigfile:
    for line in bigfile:
        #... process each line here
        pass
#Read the file block by block instead of pulling the whole file into memory
import sys

filename = "ch02-fixed-width-1M.data"
with open(filename, 'rb') as hugefile:
    chunksize = 1000
    readable = ''
    # if you want to stop after a certain number of blocks,
    # put a condition in the while
    while hugefile:
        start = hugefile.tell()
        # read the next chunksize lines as one block;
        # next() raises StopIteration at end of file
        for _ in range(chunksize):
            file_block = hugefile.next()
            print file_block
            readable = readable + file_block
        stop = hugefile.tell()
        print 'readable: %s' % readable
        print 'reading bytes from %s to %s' % (start, stop)
        print 'read bytes total:', len(readable)
        raw_input()
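#If fixed-size byte blocks are enough (rather than whole lines), a simpler sketch
#uses iter() with a sentinel:
# iter() keeps calling f.read(chunksize) until it returns the
# empty-string sentinel at end of file
with open('ch02-fixed-width-1M.data', 'rb') as f:
    chunksize = 4096
    for block in iter(lambda: f.read(chunksize), ''):
        print 'got block of %d bytes' % len(block)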
#Other approaches for reading huge files:
#  parallel frameworks such as MapReduce
#  multiprocessing (a pool of worker processes)
#==============================================================================
# 2.11 Reading streaming data sources
#==============================================================================
##Reading a file that another process keeps appending to in real time
#Use case: the input may be a file-like object or a remote HTTP resource;
#we can read input from the remote service, parse it continuously, and update charts in
#real time, or feed an intermediate queue, buffer, or database.
#In more complex data pipelines a message queue is required: continuously arriving data
#is parked in the queue for some time before we receive it.
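#A toy sketch of that buffering idea, using only the standard library's Queue and a
#thread rather than a real message broker:
import Queue
import threading
import time

q = Queue.Queue(maxsize=100)  # bounded buffer between producer and consumer

def producer():
    # stands in for the continuously arriving data
    for i in range(5):
        q.put('record-%d' % i)
        time.sleep(0.1)
    q.put(None)  # sentinel: no more data

t = threading.Thread(target=producer)
t.start()
while True:
    item = q.get()  # blocks until something arrives
    if item is None:
        break
    print 'consumed:', item
t.join()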
import time
import os

filename = 'stream.data'
with open(filename, 'r') as file:
    # move to the end of the file
    filesize = os.stat(filename)[6]
    file.seek(filesize)
    while True:
        where = file.tell()
        # try reading a line
        line = file.readline()
        # if empty, wait a second and go back
        if not line:
            time.sleep(1)
            file.seek(where)
        else:
            # the trailing , stops print from adding a newline;
            # readline() already kept the newline from the file
            print line,
----------------------------------------------------------------------------
#coding=utf-8
#The first generator reads bytes from a file
def FileStream(filename):
    with open(filename) as f:
        for line in f:
            for byte in line:
                yield byte

#The second filters the bytes in the stream
def FilterStream(source, condition):
    for byte in source:
        if condition(byte):
            yield byte

#The third prints whatever streams through
def PrintStream(source):
    for byte in source:
        print byte

PrintStream(FilterStream(FileStream('stream.data'), str.islower))
#==============================================================================
# 2.12 Importing image data into NumPy arrays
#==============================================================================
$ pip install scipy
#############The Lena image
#SciPy bundles this image in its misc module, so it can be reused easily
import scipy.misc
import matplotlib.pyplot as plt
lena = scipy.misc.lena()
plt.gray()
plt.imshow(lena)
plt.colorbar()
plt.show()
print lena.shape #(512, 512)
print lena.max() #245
print lena.dtype #int32
##########PIL
pip install PIL --allow-external PIL --allow-unverified PIL
##########Zooming into an image
import matplotlib.pyplot as plt
import scipy
import numpy
# because the image we loaded is RGB image,
# http://en.wikipedia.org/wiki/Grayscale#Converting_color_to_grayscale
bug = scipy.misc.imread('stinkbug.png')
# inspect the shape of the loaded image
print bug.shape  #(375, 500, 3)
# take a single channel to get a 2-D grayscale array
bug = bug[:,:,0]
# show original image
plt.figure()
plt.gray()
plt.subplot(121)
plt.imshow(bug)
# show 'zoomed' region
zbug = bug[100:350,140:350]  # zoom into the sub-matrix [100:350, 140:350]
plt.subplot(122)
plt.imshow(zbug)
plt.show()
###########For big images, numpy.memmap is recommended for memory-mapping the image
import numpy
#note: this maps the raw bytes of the file; it does not decode the PNG
image = numpy.memmap('stinkbug.png', dtype=int, mode='r+', shape=(375, 500))
#A dedicated image-processing package:
scikit-image:http://scikit-image.org/
#==============================================================================
# 2.13 Generating controlled random datasets
#==============================================================================
############Generating a simple random sample
import pylab
import random
SAMPLE_SIZE = 100
random.seed()  # seed the random generator; with no argument it uses the current system time
real_rand_vars = []  # store generated random values here
# we don't need the iterator value, so we call it '_'
for _ in range(SAMPLE_SIZE):
new_value = random.random() # get next random value
real_rand_vars.append(new_value)
# create histogram from data in 10 buckets
pylab.hist(real_rand_vars, 10)
pylab.xlabel("Number range")
pylab.ylabel("Count")
# show figure
pylab.show()
############Generating a time series of fake price-growth data
import pylab
import random
# days to generate data for
duration = 100
mean_inc = 0.2  # mean of the daily increment
std_dev_inc = 1.2  # standard deviation of the increment
x = range(duration)  # time series
y = []
price_today = 0  # starting price
for i in x:
    next_delta = random.normalvariate(mean_inc, std_dev_inc)
    price_today += next_delta
    y.append(price_today)
pylab.plot(x,y)
pylab.xlabel("Time")
pylab.ylabel("Value")
pylab.show()
###################Histograms for different distributions
# coding: utf-8
import random
import matplotlib
import matplotlib.pyplot as plt
SAMPLE_SIZE = 1000
buckets = 100  # histogram buckets
plt.figure()
matplotlib.rcParams.update({'font.size': 7})  # smaller font, just for this example
###uniformly distributed floats in [0.0, 1.0)
plt.subplot(621)
plt.xlabel("random.random")
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.random())
plt.hist(res, buckets)
###uniform distribution over [a, b]
plt.subplot(622)
plt.xlabel("random.uniform")
a, b = 1, SAMPLE_SIZE
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.uniform(a, b))
plt.hist(res, buckets)
###triangular distribution
plt.subplot(623)
plt.xlabel("random.triangular")
low, high = 1, SAMPLE_SIZE
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.triangular(low, high))
plt.hist(res, buckets)
###beta distribution
plt.subplot(624)
plt.xlabel("random.betavariate")
alpha, beta = 1, 10
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.betavariate(alpha, beta))
plt.hist(res, buckets)
###exponential distribution
plt.subplot(625)
plt.xlabel("random.expovariate")
lambd = 1.0 / ((SAMPLE_SIZE + 1) / 2.)
res = []
for _ in xrange(1, SAMPLE_SIZE): res.append(random.expovariate(lambd))
plt.hist(res, buckets)
###gamma distribution
# conditions on the parameters: alpha > 0 and beta > 0
# the probability density function is:
#            x ** (alpha - 1) * math.exp(-x / beta)
# pdf(x) = ----------------------------------------
#            math.gamma(alpha) * beta ** alpha
plt.subplot(626)
plt.xlabel("random.gammavariate")
alpha, beta = 1, 10
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.gammavariate(alpha, beta))
plt.hist(res, buckets)
###log-normal distribution; mu is the mean, sigma the standard deviation of the underlying normal
plt.subplot(627)
plt.xlabel("random.lognormvariate")
mu, sigma = 1, 0.5
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.lognormvariate(mu, sigma))
plt.hist(res, buckets)
###normal distribution; mu is the mean, sigma the standard deviation
plt.subplot(628)
plt.xlabel("random.normalvariate")
mu, sigma = 1, 0.5
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.normalvariate(mu, sigma))
plt.hist(res, buckets)
###Pareto distribution; alpha is the shape parameter
plt.subplot(629)
plt.xlabel("random.paretovariate")
alpha = 1
res = []
for _ in xrange(1, SAMPLE_SIZE):res.append(random.paretovariate(alpha))
plt.hist(res, buckets)
plt.tight_layout()
plt.show()
###########Random numbers
1) Seed the pseudo-random generator with seed() so that random() reproduces the same
expected values. This is useful, and better than generating random numbers in advance
and saving them to a file.
2) To avoid repeated sequences, random.SystemRandom is recommended; it is backed by
os.urandom. os.urandom has access to more entropy sources; seed() and setstate() have
no effect on it, so the samples are not reproducible.
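#A minimal sketch of the difference between the two generators:
import random

r1 = random.Random()
r2 = random.Random()
r1.seed(42)
r2.seed(42)
print r1.random() == r2.random()  # True: seeded generators are reproducible

sr = random.SystemRandom()  # backed by os.urandom
sr.seed(42)                 # silently ignored; has no effect
print sr.random()           # different on every run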
Example: we want some random words
#Linux / Unix systems
import random
with open('/usr/share/dict/words') as f:
    words = f.readlines()
words = [w.rstrip() for w in words]
for w in random.sample(words, 5):
    print w
#Windows
http://norvig.com/big.txt
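#On Windows there is no /usr/share/dict/words, so download a word list such as big.txt
#from the URL above first; a sketch assuming the file was saved next to the script:
import random
# big.txt is assumed to have been downloaded manually
with open('big.txt') as f:
    words = list(set(f.read().split()))
for w in random.sample(words, 5):
    print w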
#==============================================================================
# 2.14 Smoothing noise in real-world data
#==============================================================================