Data Science #6: What makes a good Random Forest Classifier Feature
# First, import some modules we'll need for this exercise. Make sure you
# have matplotlib, numpy, pandas, and sklearn
# installed beforehand
# standard imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
# generate random data
# the city center is at 42.0 degrees latitude and 144.0 degrees
# longitude. This is where 'the most' traffic occurs,
# thus, a taxi cab is most likely to pick up here
# the city bounds are at 41.9 degrees and 42.2 degrees latitude and
# 143.8 degrees and 144.1 degrees longitude
# NOTE: the points sampled below cover a wider box (41 to 43 degrees
# latitude, 143 to 145 degrees longitude) than those city bounds
n_samples = 50000
center_lat, center_lon = 42.0, 144.0
# create random data lists
lats = [random.uniform(41, 43) for _ in range(n_samples)]
lons = [random.uniform(143, 145) for _ in range(n_samples)]
# get the radial distance from the city center - using a^2 + b^2 = c^2,
# computed for every point in one vectorized call
r2 = np.hypot(np.array(lats) - center_lat, np.array(lons) - center_lon)
r = list(r2)
# taxi pick up time is based on probability determined by exponential decay
# with radial distance: P(picked up within 5 minutes) = exp(-2 * r)
# we use probabilities in order to add some variability to the data,
# making it more challenging for our model
# to succeed
pickup_prob = np.exp(-2.0 * r2)
# a uniform draw in [0, 1) falls below pickup_prob with exactly that
# probability, so one vectorized comparison replaces a per-point
# np.random.choice([0, 1], p=[1 - p, p]) call
tR = (np.random.random(n_samples) < pickup_prob).astype(int)
taxiResult = list(tR)
print('number of taxis within 5 minutes: ' + str(int((tR == 1).sum())))
print('number of taxis not within 5 mintues: ' + str(int((tR == 0).sum())))
# now turn into a data frame for modeling
df = pd.DataFrame({'Lat': lats, 'Lon': lons, 'R': r, 'TaxiResult': taxiResult})
df.head(10)
%matplotlib inline
# we are going to create a closeness variable to predict with
# first, we need to figure out where our probability of getting picked
# up within 5 minutes is highest.
# we are going to overplot the 'yes' (picked up) as blue and the 'no'
# (not picked up) as green, using the same radial-distance bins for both
hist_bins = np.linspace(0, 2, 50)
fig = plt.figure()
plt.hist(r2[tR == 1], bins=hist_bins, alpha=0.3, color='blue')
plt.hist(r2[tR == 0], bins=hist_bins, alpha=0.3, color='green')
# show the histogram figure itself; creating another figure here would
# only add an empty plot
plt.show()
# closeness is a binary feature: 1 for points less than 0.3 degrees from
# the city center, 0 for everything else
closeness = np.where(r2 < 0.3, 1.0, 0.0)
df['closeness'] = closeness
# radial distances of the picked-up points (not used in the code visible here)
cdf = r2[tR == 1]
# great now apply random forest classifier
# hold out 25% of the rows for testing; the split itself is seeded
train, test = train_test_split(df, test_size=0.25, random_state=2)
y_train = train['TaxiResult']
y_test = test['TaxiResult']
#
# test success of models using different features
features_ll = ['Lat', 'Lon']
features_r = ['R']
features_cn = ['closeness']
# set up the classifiers, no special settings here (the forests are not
# seeded, so scores vary from run to run)
clf_ll = RandomForestClassifier()
clf_r = RandomForestClassifier()
clf_cn = RandomForestClassifier()
# train the models
clf_ll.fit(train[features_ll], y_train)
clf_r.fit(train[features_r], y_train)
clf_cn.fit(train[features_cn], y_train)
# make a prediction
predict_ll = clf_ll.predict(test[features_ll])
predict_r = clf_r.predict(test[features_r])
predict_cn = clf_cn.predict(test[features_cn])
# score with r^2, it's going to be ugly here
r2_ll = metrics.r2_score(y_test, predict_ll)
r2_r = metrics.r2_score(y_test, predict_r)
r2_cn = metrics.r2_score(y_test, predict_cn)
# score with confusion matrix. Rows are the true labels and columns are
# the predicted labels, so for labels 0/1 it reads, from the top row
# across: true negative, false positive, false negative, true positive
c_ll = metrics.confusion_matrix(y_test, predict_ll)
c_r = metrics.confusion_matrix(y_test, predict_r)
c_cn = metrics.confusion_matrix(y_test, predict_cn)
# report each metric for the three feature sets, always in the same order
for label, score in (('Lat/Lon', r2_ll), ('R', r2_r), ('Closeness', r2_cn)):
    print('r2 for ' + label + ': ' + str(score))
for label, matrix in (('Lat/Lon', c_ll), ('R', c_r), ('Closeness', c_cn)):
    print('Confusion Matrix for ' + label + ': ' + str(matrix))
# print out the accuracy (how many successful hits)
for label, predicted in (('Lat/Lon', predict_ll), ('R', predict_r),
                         ('Closeness', predict_cn)):
    print('Accuracy Score for ' + label + ': '
          + str(metrics.accuracy_score(y_test, predicted)))
1 Comments