import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
Import test and training data
Concatenate into combined dataframe for feature engineering
df_train = pd.read_csv("../resources/train.csv")
df_test = pd.read_csv("../resources/test.csv")
combine = pd.concat([df_train, df_test], sort=False)
df_train.shape, df_test.shape, combine.shape
df_test.iloc[:5, ]
combine.iloc[15120:15125, ]
The concatenation seems to have worked
To do
Done Deal with missing data
In Progress Look for patterns
Feature Engineering
Standardize features
Train models
Test
combine.columns
df_train['Cover_Type'].unique()
#combine[combine.notnull()].count()
Write syntax for showing rows with a null value in at least one column
d = {'col1': [1, 2, 3, None],
     'col2': [None, 2, 3, 4],
     'col3': [None, None, 2, 3]}
df_na = pd.DataFrame(data=d)
df_na
df_na[df_na.apply(lambda x: x.count(), axis=1) != 3]
#df_train[df_train.apply(lambda x: x.count(), axis=1) != 56]
#df_test[df_test.apply(lambda x: x.count(), axis=1) != 55]
No null values in either the training or the test data
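For reference, a more idiomatic way to surface rows containing any null, equivalent to the count-based filter above (a sketch):
#Rows where at least one value is null
df_na[df_na.isna().any(axis=1)]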
cust_palette = {
    1: '#7fc97f',
    2: '#beaed4',
    3: '#fdc086',
    4: '#ffff99',
    5: '#386cb0',
    6: '#f0027f',
    7: '#bf5b18'
}
sns.set_palette(list(cust_palette.values()))
sns.palplot(pal=list(cust_palette.values()))
g = sns.scatterplot(data=df_train, x='Id', y='Elevation', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02), title='Cover_Type')
Visual differences in Cover Type by Elevation
To do
Done (FeatEng.ipynb) Create categories for Elevation, eg <3000, 3000-3250, >3250
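A minimal sketch of what that binning might look like with pd.cut; the exact edges used in FeatEng.ipynb may differ:
#Illustrative Elevation bins; these edges are assumptions, not the ones from FeatEng.ipynb
elev_cat = pd.cut(combine['Elevation'],
                  bins=[-np.inf, 3000, 3250, np.inf],
                  labels=['<3000', '3000-3250', '>3250'])
elev_cat.value_counts()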
g = sns.scatterplot(data=df_train, x='Id', y='Aspect', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
No discernible differences in Cover Type by Aspect
Hypothesis: The bands of Cover Type by Id come from differences in aspect within groups of same Cover Type
g = sns.scatterplot(data=df_train, x='Id', y='Slope', hue='Cover_Type', alpha=.15, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
#Changed alpha from .3 to .15 after the initial run to get a closer look at low-valued cover types
Suspect that the same thing is happening with Slope as with Aspect
There could be a concentration of low-valued Cover Types at lower slopes
To do
Done Find proportion of Cover Type by Slope and maybe make categories, eg <20 and >20
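A sketch of how those proportions could be computed, assuming a simple split at 20:
#Share of each Cover_Type within the two Slope groups (rows sum to 1)
slope_group = np.where(df_train['Slope'] < 20, '<20', '>=20')
pd.crosstab(slope_group, df_train['Cover_Type'], normalize='index')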
g = sns.scatterplot(data=df_train, x='Id', y='Horizontal_Distance_To_Hydrology', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
Possible relationship between high HDH and high-valued cover types
To do
Done Look at proportion of cover types in HDH group >600/>800
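One way to check that group, as a sketch; the 600 cutoff is one of the two candidates mentioned above, and the >800 variant would be the same filter:
#Cover type mix among observations far from water (HDH > 600)
far_water = df_train[df_train['Horizontal_Distance_To_Hydrology'] > 600]
far_water['Cover_Type'].value_counts(normalize=True).sort_index()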
g = sns.scatterplot(data=df_train, x='Id', y='Vertical_Distance_To_Hydrology', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
Possible relationship between high VDH and high-valued cover types
Possible relationship between negative VDH and high-valued cover types
To do
Done Look at proportion of cover types in VDH group >200 and <0
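A sketch comparing the two VDH groups side by side:
#Cover type mix below water level (VDH < 0) vs well above it (VDH > 200)
vdh = df_train['Vertical_Distance_To_Hydrology']
pd.DataFrame({
    'VDH<0': df_train.loc[vdh < 0, 'Cover_Type'].value_counts(normalize=True),
    'VDH>200': df_train.loc[vdh > 200, 'Cover_Type'].value_counts(normalize=True)
}).fillna(0)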
g = sns.scatterplot(data=df_train, x='Id', y='Horizontal_Distance_To_Roadways', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
Cool pattern!
Possible grouping of mid-valued Cover Types at low HDR and of high-valued Cover Types at mid HDR
To do
Done Look for groupings of HDR, eg <2000, 2000-5000, >5000
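A sketch of those HDR groupings, combining pd.cut with a normalized count:
#Proportion of each Cover_Type within the three HDR bands
hdr_cat = pd.cut(df_train['Horizontal_Distance_To_Roadways'],
                 bins=[-np.inf, 2000, 5000, np.inf],
                 labels=['<2000', '2000-5000', '>5000'])
df_train.groupby(hdr_cat)['Cover_Type'].value_counts(normalize=True)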
combine[['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']].describe()
Hillshade at 9am and Noon look very similar based on their summary statistics
To do
Done Consider combining 9am and Noon (see the sketch after this list)
Done Look at shape of distribution for each hillshade
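A minimal sketch of one way that combination could look; the simple mean is an assumption, not necessarily what the feature engineering ended up using:
#Average the two similar hillshades into a single candidate feature
combine['Hillshade_AM_Noon'] = combine[['Hillshade_9am', 'Hillshade_Noon']].mean(axis=1)
combine['Hillshade_AM_Noon'].describe()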
cmap = {'Hillshade_9am': '#1b9e77', 'Hillshade_Noon': '#d95f02', 'Hillshade_3pm': '#7570b3'}
g = sns.kdeplot(df_train['Hillshade_9am'], color=cmap['Hillshade_9am'])
g = sns.kdeplot(df_train['Hillshade_Noon'], color=cmap['Hillshade_Noon'])
g = sns.kdeplot(df_train['Hillshade_3pm'], color=cmap['Hillshade_3pm'])
Hillshade 9am and Noon have visually similar distributions as well as similar summary statistics
3pm has a distinct distribution
To do
Done Would be curious if Cover Type had a different relationship to Hillshade at 9am and Noon
fig, (g1, g2, g3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(18, 5))
g1.scatter(data=df_train, x='Id', y='Hillshade_9am', c='Cover_Type', alpha=.15, cmap='tab10')
g2.scatter(data=df_train, x='Id', y='Hillshade_Noon', c='Cover_Type', alpha=.15, cmap='tab10')
g3.scatter(data=df_train, x='Id', y='Hillshade_3pm', c='Cover_Type', alpha=.15, cmap='tab10')
g1.set_title('Hillshade_9am')
g2.set_title('Hillshade_Noon')
g3.set_title('Hillshade_3pm')
No visible difference in Cover Type according to Hillshade at any time
To do
Add legend for Cover Types
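A sketch of that to-do using matplotlib's legend_elements, which builds legend handles straight from the scatter's color mapping (shown for the 9am panel; the other two would follow the same pattern):
fig, ax = plt.subplots(figsize=(6, 5))
sc = ax.scatter(data=df_train, x='Id', y='Hillshade_9am', c='Cover_Type', alpha=.15, cmap='tab10')
ax.legend(*sc.legend_elements(), title='Cover_Type', bbox_to_anchor=(1, 1.02))
ax.set_title('Hillshade_9am')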
g = sns.scatterplot(data=df_train, x='Id', y='Horizontal_Distance_To_Fire_Points', hue='Cover_Type', alpha=.3, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
Possible grouping of high-valued cover types at mid HDFP and of mid-valued ones at low HDFP
Will run again at lower alpha
g = sns.scatterplot(data=df_train, x='Id', y='Horizontal_Distance_To_Fire_Points', hue='Cover_Type', alpha=.15, palette=cust_palette)
g.legend(range(1, 8), bbox_to_anchor=(1, 1.02))
To do
Done Find proportion of high-valued cover types at mid-to-low HDFP; unsure of a trend
Done Look at proportion of mid-valued cover types, 4 and 5, at low HDFP, eg <3000
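A sketch of the second item; isin plus mean gives the proportion directly:
#Share of cover types 4 and 5 among observations with HDFP < 3000
low_hdfp = df_train[df_train['Horizontal_Distance_To_Fire_Points'] < 3000]
low_hdfp['Cover_Type'].isin([4, 5]).mean()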
def Wild_Area_Cat(row):
    #Collapse the four one-hot wilderness columns into a single 1-4 category
    if row['Wilderness_Area1'] == 1:
        return 1
    elif row['Wilderness_Area2'] == 1:
        return 2
    elif row['Wilderness_Area3'] == 1:
        return 3
    elif row['Wilderness_Area4'] == 1:
        return 4
Test function for categorizing Wilderness Areas
df_train[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']][df_train['Wilderness_Area1'] == 0].head()
df_train.iloc[1620:1628, :][['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']]
for i in range(1620, 1628):
    print(Wild_Area_Cat(df_train.iloc[i, :]))
Wild_Area_Cat behaves as desired
Came up with a new idea
Divide df_train by Wilderness area and use subplots
Keep categorizing function, just in case
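For reference, a vectorized alternative to the row-wise function, as a sketch; it relies on the four indicator columns being mutually exclusive:
wild_cols = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']
#idxmax returns the first column holding a 1; strip the prefix to recover the area number
wild_cat = df_train[wild_cols].idxmax(axis=1).str.replace('Wilderness_Area', '').astype(int)
wild_cat.value_counts()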
area1Cover = df_train['Id'][df_train['Wilderness_Area1'] == 1].groupby(by=df_train['Cover_Type']).count()
area1Dict = {}
for i in range(1, 8):
    try:
        area1Dict[str(i)] = area1Cover[i]
    except KeyError:
        area1Dict[str(i)] = 0 #Cover types absent from this area get a zero count
area1Dict
Code for count behaves as desired
Cast Cover Type keys to strings so the bar charts below treat them as categorical labels
area2Cover = df_train['Id'][df_train['Wilderness_Area2'] == 1].groupby(by=df_train['Cover_Type']).count()
area2Dict = {}
for i in range(1, 8):
    try:
        area2Dict[str(i)] = area2Cover[i]
    except KeyError:
        area2Dict[str(i)] = 0
area2Dict
area3Cover = df_train['Id'][df_train['Wilderness_Area3'] == 1].groupby(by=df_train['Cover_Type']).count()
area3Dict = {}
for i in range(1, 8):
    try:
        area3Dict[str(i)] = area3Cover[i]
    except KeyError:
        area3Dict[str(i)] = 0
area3Dict
area4Cover = df_train['Id'][df_train['Wilderness_Area4'] == 1].groupby(by=df_train['Cover_Type']).count()
area4Dict = {}
for i in range(1, 8):
    try:
        area4Dict[str(i)] = area4Cover[i]
    except KeyError:
        area4Dict[str(i)] = 0
area4Dict
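The four blocks above differ only in the area column; a helper like this could replace them (a sketch; cover_counts and area_dicts are hypothetical names):
def cover_counts(df, indicator_col):
    #Count Cover_Type occurrences where the indicator column is 1, filling absent types with 0
    counts = df.loc[df[indicator_col] == 1, 'Cover_Type'].value_counts()
    return {str(i): int(counts.get(i, 0)) for i in range(1, 8)}

area_dicts = [cover_counts(df_train, 'Wilderness_Area{}'.format(n)) for n in range(1, 5)]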
fig, (g1, g2, g3, g4) = plt.subplots(nrows=1, ncols=4, sharey=True, figsize=(18, 4))
g1.bar(x=list(area1Dict.keys()),
       height=list(area1Dict.values()),
       color='#a6cee3')
g2.bar(x=list(area2Dict.keys()),
       height=list(area2Dict.values()),
       color='#1f78b4')
g3.bar(x=list(area3Dict.keys()),
       height=list(area3Dict.values()),
       color='#b2df8a')
g4.bar(x=list(area4Dict.keys()),
       height=list(area4Dict.values()),
       color='#33a02c')
g1.set_title('Wilderness_Area1')
g2.set_title('Wilderness_Area2')
g3.set_title('Wilderness_Area3')
g4.set_title('Wilderness_Area4')
Wilderness Area1 has similar numbers of cover types 1 and 2, and decreasing amounts of 5 and 7
Area2 has very few observations, but all are of cover types 7, 1, and 2, in decreasing order
Area3 has similar amounts of types 5 and 7 and of 1, 2, 3, and 6. The only type not represented is 4
Area4 has many of type 4, similar amounts of 3 and 6, and very few of type 2
Observations
Cover Type 4 is only observed in Area4
Cover Type 5 is only observed in Areas 1 and 3
Area2 has too few observations to conclude any trends
To do
Done Look at proportions of each wilderness area in combined data
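Since the area columns are 0/1 indicators, their means give those proportions directly (a quick sketch):
#Mean of a 0/1 indicator column equals the share of observations in that area
combine[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].mean()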
soilTypes = combine.columns[combine.columns.str.contains(r'^Soil')]
soilTypes
Soil1Cover = df_train['Id'][df_train['Soil_Type1'] == 1].groupby(by=df_train['Cover_Type']).count()
Soil1Dict = {}
for i in range(1, 8):
    try:
        Soil1Dict[str(i)] = Soil1Cover[i]
    except KeyError:
        Soil1Dict[str(i)] = 0
Soil1Dict
Looks like there could be useful relationships within the Soil Types
SoilDictAll = []
SoilDict = {}
for soil in soilTypes: #Loop through soil types (renamed from `type` to avoid shadowing the builtin)
    SoilCover = df_train['Id'][df_train[soil] == 1].groupby(by=df_train['Cover_Type']).count() #Series of cover type counts for this soil type
    if len(SoilCover) == 0: #Catch the edge case found in Soil_Type7: no training observations at all
        SoilDict = {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0}
    else:
        for i in range(1, 8): #Make dict of cover type counts for this soil type
            try:
                SoilDict[str(i)] = SoilCover[i]
            except KeyError:
                SoilDict[str(i)] = 0
    SoilDictAll.append(SoilDict) #Add dict to the list for all soil types
    SoilDict = {} #Reset dict for next soil type
df_train['Id'][df_train['Soil_Type3'] == 1].groupby(by=df_train['Cover_Type']).count()
SoilDictAll[2]
Dict of soil types constructed correctly
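The KeyError handling could also be folded into pandas with reindex, as a sketch (soil_counts is a hypothetical name):
#value_counts + reindex yields a full 1-7 count vector even for empty soil types
soil_counts = {s: df_train.loc[df_train[s] == 1, 'Cover_Type']
                  .value_counts().reindex(range(1, 8), fill_value=0)
               for s in soilTypes}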
fig, axs = plt.subplots(5, 8, sharey=True, figsize=(20, 12))
for i in range(5): #Plot bar graphs of cover type counts for each of the 40 soil types
    for j in range(8):
        axs[i][j].bar(x=list(SoilDictAll[j + 8 * i].keys()),
                      height=list(SoilDictAll[j + 8 * i].values()),
                      color='#004fce')
        axs[i][j].set_title('Soil_Type{}'.format(j + 8 * i + 1))
Looks like a whole bunch of insignificant relationships
Suspect that there are not enough observations of most soil types in df_train to draw conclusions
See some weak similarities in cover type counts by soil type, eg ST 5-6, 22-24, ~29-33, or 38-40
To do
Look up whether there are similarities between soil types with consecutive numbers
combine.columns