06-basic k-nearest neighbors + pandas取值技巧

获取data

data参见这里
RMSE & MSE的对比

import pandas as pd
# Load the Airbnb listings into a DataFrame.
dc_listings = pd.read_csv('dc_airbnb.csv')
# .iloc[0] selects the first row by position.
first_row = dc_listings.iloc[0]
print(first_row)
# Select one cell: pick the column first, then the row position.
first_cell = dc_listings["col"].iloc[0]
print(first_cell)

例子：我们有一个三个卧室的房子要租出去，判断租金应该定多少？
方法：在网上找和我们的房子类似的房子，看这些房子的平均租金，然后我们定这个租金。
热身：计算dc_listings里accommodates（可容纳人数）那一列第一行的值，与3之间的欧几里得距离（下面的代码用accommodates列作为比较的特征）：

import math
# Value of "accommodates" in the first row.
first_value = dc_listings["accommodates"].iloc[0]
# Euclidean distance to 3 in one dimension: square the difference, then
# take the square root.
first_distance = math.sqrt((3 - first_value) ** 2)
# With a single value this equals the absolute difference, np.abs().
print(first_distance)

对一整列的每个值都进行计算，可以用df["column"].apply()

# Distance between every "accommodates" value and 3.
# Vectorised pandas arithmetic is used instead of apply(np.abs(...)):
# numpy has not been imported yet at this point of the notes, and the
# column-wise form yields the same values without a Python-level loop.
dc_listings["distance"] = (dc_listings["accommodates"] - 3).abs()
print(dc_listings["distance"].value_counts())

将dataframe的index打乱，打乱之后重新取值，取price

import numpy as np
# Seed numpy's random generator before shuffling.
np.random.seed(1)
# Shuffle the rows by indexing with a random permutation of 0..len-1,
# then order them by the "distance" column.
shuffled_index = np.random.permutation(len(dc_listings))
dc_listings = dc_listings.loc[shuffled_index].sort_values('distance')
print(dc_listings['price'].iloc[0:10])

对price这一列的值进行处理，去掉", $": df['col'].str.replace(",","")
将结果变成float形式: series.astype('float')
选前5行的平均值：df["col"].iloc[0:5].mean()

# Strip thousands separators and the currency sign from the price strings.
# regex=False forces literal matching: "$" is a regex metacharacter (end of
# string), so relying on the version-dependent default is fragile.
stripped_commas = dc_listings["price"].str.replace(",", "", regex=False)
stripped_commas = stripped_commas.str.replace("$", "", regex=False)
dc_listings["price"] = stripped_commas.astype('float')
# Average price of the first five rows.
mean_price = dc_listings["price"].iloc[0:5].mean()
print(mean_price)

把以上这些步骤写成一个小函数，来对不同accommodates取值的平均价格做预测

# Brought along the changes we made to the `dc_listings` Dataframe.
import numpy as np
# Reload the data and convert the price strings to floats.
dc_listings = pd.read_csv('dc_airbnb.csv')
# regex=False: match "," and "$" literally ("$" is a regex metacharacter).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
# Shuffle the rows with a random permutation of the row labels.
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]

def predict_price(new_listing):
    """Estimate a price from the five most similar listings.

    Similarity is the absolute difference between each row's
    "accommodates" value in the module-level ``dc_listings`` frame and
    ``int(new_listing)``.

    Args:
        new_listing: Number of guests to price for; coerced with ``int``.

    Returns:
        Mean "price" of the five rows with the smallest distance.
    """
    # Keep the distances in a standalone Series. The original wrote a
    # "distance" column through an alias of dc_listings, so every call
    # mutated the shared frame.
    distances = (dc_listings["accommodates"] - int(new_listing)).abs()
    # Work with row positions so the lookup does not depend on the
    # (shuffled) index labels.
    nearest_positions = distances.reset_index(drop=True).sort_values().index[:5]
    return dc_listings["price"].iloc[nearest_positions].mean()

# Predicted prices for new_listing values 1, 2 and 4 (evaluated in that order).
acc_one, acc_two, acc_four = (predict_price(guests) for guests in (1, 2, 4))

Cross validation

把数据集分为train和test两组，假设test那一组里面，accommodate列的第一个值是5，就用5去和train那一组中accommodate列的每一行的值去比较，算距离，然后按distance排序，选前五个price的平均值，作为predicted price，放到test表里的price列。

import pandas as pd
import numpy as np

dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False: match "," and "$" literally ("$" is a regex metacharacter).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

# First 2792 rows for training, the rest for testing. .copy() gives each
# split its own data, so adding columns to it later does not write into a
# slice of dc_listings (avoids SettingWithCopyWarning).
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()

def predict_price(new_listing):
    """Predict a price as the mean of the five nearest training prices.

    Distance is the absolute difference between ``new_listing`` and each
    row's "accommodates" value in the module-level ``train_df``.

    Args:
        new_listing: The "accommodates" value of the listing to price.

    Returns:
        Mean "price" of the five training rows with the smallest distance.
    """
    # Distances live in their own Series. The original assigned a
    # "distance" column through an alias of train_df, mutating the
    # training frame (and writing into a slice of dc_listings).
    distances = (train_df['accommodates'] - new_listing).abs()
    # Use row positions so the lookup is independent of the index labels.
    nearest_positions = distances.reset_index(drop=True).sort_values().index[:5]
    nearest_neighbor_prices = train_df['price'].iloc[nearest_positions]
    return nearest_neighbor_prices.mean()

# Predict a price for every test row from its "accommodates" value.
test_df["predicted_price"] = test_df["accommodates"].apply(predict_price)

检验预测值是否准确

方法一：计算mean absolute error
mae = (|预测值1 - 实际值1| + |预测值2-实际值2| + ... + |预测值n - 实际值n|) / n

# Mean absolute error between predicted and actual prices.
mae = (test_df["predicted_price"] - test_df["price"]).abs().mean()

方法二：计算mean squared error
mse = ((预测值1-实际值1)^2 + (预测值2-实际值2)^2 + ... + (预测值n-实际值n)^2) / n

# Squared error per row, stored in the "sm" column, then averaged.
squared_errors = (test_df["predicted_price"] - test_df["price"]) ** 2
test_df["sm"] = squared_errors
mse = test_df["sm"].mean()
# The same computation as a single expression:
mse = ((test_df["predicted_price"] - test_df["price"]) ** 2).mean()

以上模型，是依靠 "accommodate"这一个变量出发建立的模型，需要其他的模型进行对比，来判断mse是高还是低

用test_df["bathrooms"]来建立一个新的模型，计算mse

# Re-create the splits. .copy() makes them independent frames, so the
# columns added to test_df below are not written into a slice of
# dc_listings (avoids SettingWithCopyWarning).
train_df = dc_listings.iloc[0:2792].copy()
test_df = dc_listings.iloc[2792:].copy()

def predict_price(new_listing):
    """Predict a price from the five training rows closest in "bathrooms".

    Distance is the absolute difference between ``new_listing`` and each
    row's "bathrooms" value in the module-level ``train_df``.

    Args:
        new_listing: The "bathrooms" value of the listing to price.

    Returns:
        Mean "price" of the five training rows with the smallest distance.
    """
    # Distances live in their own Series. The original assigned a
    # "distance" column through an alias of train_df, mutating the
    # training frame on every call.
    distances = (train_df['bathrooms'] - new_listing).abs()
    # Use row positions so the lookup is independent of the index labels.
    nearest_positions = distances.reset_index(drop=True).sort_values().index[:5]
    nearest_neighbors_prices = train_df['price'].iloc[nearest_positions]
    return nearest_neighbors_prices.mean()

# Predict from "bathrooms", then score the predictions.
test_df["predicted_price"] = test_df["bathrooms"].apply(predict_price)
price_error = test_df["predicted_price"] - test_df["price"]
test_df["squared_error"] = price_error ** 2
mse = test_df["squared_error"].mean()
print(mse)

# Root of the mean squared error.
rmse = np.sqrt(mse)
print(rmse)

选更多的参数，提高模型准确性

先看dc_listings表里面有哪些字段，有哪些为空
# Print pandas' summary of the frame (DataFrame.info) to inspect its columns.
dc_listings.info()

import pandas as pd
import numpy as np
np.random.seed(1)

dc_listings = pd.read_csv('dc_airbnb.csv')
# Shuffle the rows with a random permutation of the row labels.
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
# regex=False: match "," and "$" literally ("$" is a regex metacharacter).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

print(dc_listings.head())

dc_listings.info()

把一些与living space无关，或者难以直接比较距离的字段去掉

# Columns removed before modelling.
drop_columns = [
    'room_type', 'city', 'state', 'latitude', 'longitude', 'zipcode',
    'host_response_rate', 'host_acceptance_rate', 'host_listings_count',
]
dc_listings = dc_listings.drop(columns=drop_columns)
# Number of missing values left in each remaining column.
missing_per_column = dc_listings.isnull().sum()
print(missing_per_column)

把数据标准化
可以直接对整个df表做标准化处理，再把price一列换成原来的price值

# Standardise every column: subtract its mean and divide by its standard
# deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = (dc_listings - column_means) / column_stds
# Put the original, unscaled price back.
normalized_listings['price'] = dc_listings['price']
print(normalized_listings.head(3))

Euclidean Distance

scipy里面有现成的函数可以直接计算两行之间的euclidean distance
distance.euclidean(first_listing, fifth_listing)

from scipy.spatial import distance
# Compare the first and fifth rows on these two features.
feature_columns = ['accommodates', 'bathrooms']
first_listing = normalized_listings.iloc[0][feature_columns]
fifth_listing = normalized_listings.iloc[4][feature_columns]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
print(first_fifth_distance)

Nearest Neighbors

from sklearn.neighbors import KNeighborsRegressor

train_columns = ['accommodates', 'bathrooms']
# First 2792 normalised rows for training, the remainder for testing.
train_df = normalized_listings.iloc[0:2792]
test_df = normalized_listings.iloc[2792:]

# Brute-force 5-nearest-neighbours regressor.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)

# Fit on the selected training columns against the price.
train_features = train_df[train_columns]
train_target = train_df['price']
knn.fit(train_features, train_target)

# Predict prices for the test rows.
predictions = knn.predict(test_df[train_columns])

计算mean squared error以及root of mean squared error

from sklearn.metrics import mean_squared_error

train_columns = ['accommodates', 'bathrooms']
# Build and fit in one expression (fit returns the estimator itself).
knn = KNeighborsRegressor(
    n_neighbors=5, algorithm='brute', metric='euclidean'
).fit(train_df[train_columns], train_df['price'])
predictions = knn.predict(test_df[train_columns])

# MSE of the predictions against the actual prices, and its square root.
two_features_mse = mean_squared_error(test_df["price"], predictions)
two_features_rmse = np.sqrt(two_features_mse)

print(two_features_mse, two_features_rmse)

用4个变量来训练模型

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')


# Fit model to data.
knn.fit(train_df[features], train_df['price'])

# Use model to make predictions.
four_predictions = knn.predict(test_df[features])

from sklearn.metrics import mean_squared_error

# Pass the arguments as (y_true, y_pred), matching the documented signature
# and the other calls in these notes. MSE is symmetric, so the value does
# not change.
four_mse = mean_squared_error(test_df['price'], four_predictions)

four_rmse = np.sqrt(four_mse)

print(four_mse, four_rmse)

Hyper_params

根据不同的k，计算k-neighbors

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

hyper_params = [1, 2, 3, 4, 5]
# The feature list does not change between iterations, so define it once.
features = ["accommodates", "bedrooms", "bathrooms", "number_of_reviews"]

mse_values = []

for item in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=item, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    # Collect each result. The original rebound mse_values to the scalar,
    # discarding the list and keeping only the last k's MSE.
    mse_values.append(mse)
    print(mse)

用range(1,21)将hyper params进一步扩大，进行计算

features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']

hyper_params = list(range(1, 21))

mse_values = list()

# Fit one brute-force KNN per k and record its MSE on the test rows.
for hp in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)
print(mse_values)
# Scatter plot of MSE against k. `plt` was used without ever being
# imported in these notes, so import it here.
import matplotlib.pyplot as plt
plt.scatter(hyper_params, mse_values)
plt.show()

再尝试一次，把除了price的列作为features，进行训练

hyper_params = list(range(1, 21))
mse_values = list()
# Every column except the target is used as a feature.
# NOTE(review): KNeighborsRegressor.fit presumably rejects missing values;
# confirm these columns contain no NaN before fitting.
features = train_df.columns.tolist()
features.remove('price')

for hp in hyper_params:
    knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
    knn.fit(train_df[features], train_df['price'])
    predictions = knn.predict(test_df[features])
    mse = mean_squared_error(test_df['price'], predictions)
    mse_values.append(mse)

# `plt` was used without ever being imported in these notes, so import it
# here.
import matplotlib.pyplot as plt
plt.scatter(hyper_params, mse_values)
plt.show()

寻找最小的mse

two_features = ['accommodates', 'bathrooms']
three_features = ['accommodates', 'bathrooms', 'bedrooms']
hyper_params = list(range(1, 21))
# Test MSE per k for the two-feature and the three-feature model.
two_mse_values = []
three_mse_values = []
# Best k -> its MSE, for each model.
two_hyp_mse = {}
three_hyp_mse = {}

# Same procedure for both feature sets: fit a brute-force KNN for every k
# and record the MSE on the test rows.
for feature_set, mse_list in (
    (two_features, two_mse_values),
    (three_features, three_mse_values),
):
    for hp in hyper_params:
        knn = KNeighborsRegressor(n_neighbors=hp, algorithm='brute')
        knn.fit(train_df[feature_set], train_df['price'])
        predictions = knn.predict(test_df[feature_set])
        mse_list.append(mean_squared_error(test_df['price'], predictions))

# Smallest MSE and the k that produced it (list position + 1; the first
# occurrence wins on ties).
two_lowest_mse = min(two_mse_values)
two_lowest_k = two_mse_values.index(two_lowest_mse) + 1

three_lowest_mse = min(three_mse_values)
three_lowest_k = three_mse_values.index(three_lowest_mse) + 1

two_hyp_mse[two_lowest_k] = two_lowest_mse
three_hyp_mse[three_lowest_k] = three_lowest_mse

print(two_hyp_mse)
print(three_hyp_mse)

重新做一遍 - a new project

import numpy as np
import pandas as pd

dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False: match "," and "$" literally ("$" is a regex metacharacter).
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')

shuffled_index = np.random.permutation(len(dc_listings))

# Reorder once, then cut into two halves by position. .copy() makes each
# half an independent frame, so columns added to it later are not written
# into a slice of the shuffled frame.
shuffled_listings = dc_listings.loc[shuffled_index]
split_one = shuffled_listings.iloc[0:1862].copy()
split_two = shuffled_listings.iloc[1862:].copy()

Holdout Validation

把数据集按50% vs 50%分成两半，轮流作为train和test

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# The test frames are copies: the original aliased them to the training
# frames, so adding "predicted_price" to a test frame also added it to the
# other iteration's training data.
train_one = split_one
test_one = split_two.copy()
train_two = split_two
test_two = split_one.copy()
# First half: train on split_one, evaluate on split_two.
model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
# ** 0.5 instead of ** (1/2): the latter is ** 0 under Python 2 integer
# division.
iteration_one_rmse = mean_squared_error(test_one["price"], test_one["predicted_price"]) ** 0.5

# Second half: train on split_two, evaluate on split_one.
model.fit(train_two[["accommodates"]], train_two["price"])
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
iteration_two_rmse = mean_squared_error(test_two["price"], test_two["predicted_price"]) ** 0.5

avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])

print(iteration_one_rmse, iteration_two_rmse, avg_rmse)

K-fold Validation

重新建一列，把dataframe分为5个部分，做K-fold Validation

# 按照index给新的列赋值的方式
# Assign a fold number to consecutive row ranges through their index labels.
# DataFrame.set_value was deprecated and later removed from pandas; .loc is
# the supported way to set values by label and creates the "fold" column on
# the first assignment.
fold_bounds = [(0, 744), (744, 1488), (1488, 2232), (2232, 2976), (2976, 3723)]
for fold_number, (start, stop) in enumerate(fold_bounds, start=1):
    dc_listings.loc[dc_listings.index[start:stop], "fold"] = fold_number

print(dc_listings["fold"].value_counts())

重新进行K-fold Validation
按照fold=1和fold=2~5进行划分test和train

# Training

model = KNeighborsRegressor()
# Train on every fold except fold 1.
train_iteration_one = dc_listings[dc_listings["fold"] != 1]
print(train_iteration_one)

# Fold 1 is the test set. .copy() makes it an independent frame, so adding
# "predicted_price" below does not trigger SettingWithCopyWarning.
test_iteration_one = dc_listings[dc_listings["fold"] == 1].copy()
model.fit(train_iteration_one[["accommodates"]], train_iteration_one["price"])

# Predicting
labels = model.predict(test_iteration_one[["accommodates"]])
test_iteration_one["predicted_price"] = labels
iteration_one_mse = mean_squared_error(test_iteration_one["price"], test_iteration_one["predicted_price"])
# ** 0.5 instead of ** (1/2): the latter is ** 0 under Python 2 integer
# division.
iteration_one_rmse = iteration_one_mse ** 0.5

写一个函数，对每个fold轮流做训练和验证

# Use np.mean to calculate the mean.
import numpy as np
fold_ids = [1, 2, 3, 4, 5]


def train_and_validate(df, folds):
    """Run k-fold validation of a KNN model on the "accommodates" column.

    For each fold id, rows whose "fold" value equals that id form the test
    set and all other rows form the training set.

    Args:
        df: DataFrame with "fold", "accommodates" and "price" columns.
        folds: Iterable of fold ids to hold out, one per iteration.

    Returns:
        List with one RMSE per fold, in the order of ``folds``.
    """
    fold_rmses = []
    for fold in folds:
        is_holdout = df["fold"] == fold
        train_rows = df[~is_holdout]
        test_rows = df[is_holdout]
        regressor = KNeighborsRegressor()
        regressor.fit(train_rows[["accommodates"]], train_rows["price"])
        predicted = regressor.predict(test_rows[["accommodates"]])
        fold_mse = mean_squared_error(test_rows["price"], predicted)
        fold_rmses.append(fold_mse ** 0.5)
    return fold_rmses


rmses = train_and_validate(dc_listings, fold_ids)
# Average the per-fold errors with np.mean.
avg_rmse = np.mean(rmses)
print(rmses, avg_rmse)

最后编辑于：2017.12.08 22:48:35

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 202,056评论 5赞 474
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 84,842评论 2赞 378
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 148,938评论 0赞 335
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 54,296评论 1赞 272
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 63,292评论 5赞 363
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,413评论 1赞 281
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 37,824评论 3赞 393
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,493评论 0赞 256
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 40,686评论 1赞 295
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,502评论 2赞 318
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,553评论 1赞 329
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,281评论 4赞 318
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 38,820评论 3赞 305
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,873评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,109评论 1赞 258
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 42,699评论 2赞 348
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,257评论 2赞 341