import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
df = pd.read_csv('../resources/owm09172017_09172018.csv')
Overview of the dataset
df.head()
df.shape
df.columns
All data is, by design, from the Central Park station
Ideas
Cloud cover sunlight/present weather
Could I do something neat with wind direction and speed? Some kind of cartesian coordinates with vectors for wind?
Could choose daily data if hourly is too cluttered. Choose daily or monthly if displaying muliple years' worth of data
columns_keep = ['DATE', 'HOURLYSKYCONDITIONS', 'HOURLYVISIBILITY', 'HOURLYPRSENTWEATHERTYPE', 'HOURLYRelativeHumidity', 'HOURLYWindSpeed', 'HOURLYWindDirection',
'HOURLYWindGustSpeed', 'HOURLYPrecip', 'DAILYSunrise', 'DAILYSunset', 'DAILYWeather', 'DAILYPrecip', 'DAILYSnowfall']
df_thin = df[columns_keep]
df_thin.head()
Missing data
df_thin.isna().sum()
Available Data
df_thin.shape[0] - df_thin.isna().sum()
Drop daily values for now
df_skinny = df_thin.drop(['DAILYWeather', 'DAILYPrecip', 'DAILYSnowfall'], axis=1)
df_skinny.shape[0] - df_skinny.isna().sum()
Create features for DATE parts
df_skinny['DATE'].head()
def get_year(row): # Get the year from DATE column
return int(row[:4])
df_skinny['Year'] = df_skinny['DATE'].apply(get_year)
def get_month(row): # Get the month from DATE column
return int(row[5:7])
df_skinny['Month'] = df_skinny['DATE'].apply(get_month)
def get_day(row): # Get the day from DATE column
return int(row[8:10])
df_skinny['Day'] = df_skinny['DATE'].apply(get_day)
def get_hour(row): # Get the hour from DATE column
return int(row[11:13])
df_skinny['Hour'] = df_skinny['DATE'].apply(get_hour)
Create matching features for sunrise and sunset
df_skinny['SunriseHour'] = df_skinny['DAILYSunrise'] // 100
df_skinny['SunriseMin'] = df_skinny['DAILYSunrise'] % 100
df_skinny['SunsetHour'] = df_skinny['DAILYSunset'] // 100
df_skinny['SunsetMin'] = df_skinny['DAILYSunset'] % 100
df_skinny['HOURLYSKYCONDITIONS'].head()
def get_weather_type(row): #Get the weather type from HOURLYSKYCONDITIONS column
if type(row) == str:
search = re.findall(r'([A-Z]*)', row)
return search[0]
else:
return 'NaN'
df_skinny['HourlySkyWeather'] = df_skinny['HOURLYSKYCONDITIONS'].apply(get_weather_type)
df_skinny['HourlySkyWeather'].head()
set(df_skinny['HourlySkyWeather'])
'10' which was supposed to denote partly cloudy, does that entry exist in the original feature? No
df_skinny['HOURLYSKYCONDITIONS'][df_skinny['HOURLYSKYCONDITIONS'].str.startswith('10', na=False)]
Could try to extract all weather types from HOURLYSKYCONDITIONS, but hold off until after looking at HOURLYPRSENTWEATHERTYPE
df_skinny['HOURLYVISIBILITY'].describe()
set(df_skinny['HOURLYVISIBILITY'])
Remove the trailing V because the documentation does not provide a justification for it
def get_visibility(row): # Create tidy hourly visibility feature
if type(row) == str:
search = re.search(r'[0-9\.]*', row)
return float(search[0])
else:
return 'NaN'
df_skinny['HourlyVisibility'] = df_skinny['HOURLYVISIBILITY'].apply(get_visibility)
set(df_skinny['HourlyVisibility'])
df_skinny['HOURLYPRSENTWEATHERTYPE'].describe()
HOURLYPRSENTWEATHERTYPE is formatted as AU|AW|MW where AU and AW are automatic weather type measurements and MW is a manual measurement
set(df_skinny['HOURLYPRSENTWEATHERTYPE'])
Test regex
string = 'SN:03 FZ:8 FG:2 |FG:35 SN:72 |'
re.split(r'\|', string)
def get_AU_weather(row): # Get the AU weather measurement
if type(row) == str:
search = re.split(r'\|', row)
return search[0]
else:
return 'NaN'
def get_AW_weather(row): # Get the AW weather measurement
if type(row) == str:
search = re.split(r'\|', row)
return search[1]
else:
return 'NaN'
def get_MW_weather(row): # Get the AU weather measurement
if type(row) == str:
search = re.split(r'\|', row)
return search[2]
else:
return 'NaN'
df_skinny['AUWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_AU_weather)
df_skinny['AWWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_AW_weather)
df_skinny['MWWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_MW_weather)
set(df_skinny['AUWeather'])
Refine these weather type features if they are to be used
df_skinny['HOURLYRelativeHumidity'].describe()
fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYRelativeHumidity'][df_skinny['HOURLYRelativeHumidity'].isna() == False], bins=25)
axs.set_title('Hourly Relative Humidity')
df_skinny['HOURLYWindSpeed'].describe()
fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindSpeed'][df_skinny['HOURLYWindSpeed'].isna() == False], bins=30)
axs.set_title('Hourly Wind Speed')
Indication of rounding for wind speed at low values, eg. at 0, 3, 5 are used in place of 1, 2 and 4
Wind direction is given in degrees from true north, with 360 as North and 000 as calm conditions
No explanation in the documentation for VRB, so treat it as a missing value
df_skinny['HOURLYWindDirection'].describe()
set(df_skinny['HOURLYWindDirection'])
Make a numeric feature for Wind Direction
def get_numeric_wind(row): # Create a numeric Wind Direction feature and remove VRB encoding
if str(row).isnumeric() == True:
return int(row)
else:
return False
df_skinny['HourlyWindDirection'] = df_skinny['HOURLYWindDirection'].apply(get_numeric_wind)
set(df_skinny['HourlyWindDirection'])
df_skinny['DATE'].groupby(by = df_skinny['HourlyWindDirection']).count()
df_skinny['DATE'].groupby(by = df_skinny['HourlyWindDirection']).count() / df_skinny['DATE'][df_skinny['HourlyWindDirection'].isna() == False].count()
df_wind = df_skinny['DATE'][df_skinny['HourlyWindDirection'] != 0].groupby(by = df_skinny['HourlyWindDirection']).count() / df_skinny['DATE'][df_skinny['HourlyWindDirection'].isna() == False][df_skinny['HourlyWindDirection'] != 0].count()
df_wind.sort_values(ascending = False)[:10]
The most frequent nonzero wind direction values belong to east and west directions, ie 50-70 for west and 260-300 for east
df_skinny['HOURLYWindSpeed'].describe()
fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindSpeed'][df_skinny['HOURLYWindSpeed'].isna() == False], bins = 30)
axs.set_title('Hourly Wind Speed')
Again, there is evidence of rounding at low integer values, ie at 0, 3, 5 instead of 1, 2 and 4
df_skinny['HOURLYWindGustSpeed'].describe()
fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindGustSpeed'][df_skinny['HOURLYWindGustSpeed'].isna() == False], bins = 30)
axs.set_title('Hourly Wind Gust Speed')
There was probably rounding at wind gust speeds approaching 20 mph
Make feature for minutesof sunlight
df_skinny[['SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin']].head()
df_skinny['SunlightMin'] = (df_skinny['SunsetHour'] - df_skinny['SunriseHour']) * 60 + df_skinny['SunsetMin'] - df_skinny['SunriseMin']
df_skinny[['SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin', 'SunlightMin']].iloc[100]
# Spot checking the results
Remove troublesome values the features to be used in visualization:
Year, Month, Day, Hour, SunlightMin, HourlySkyWeather, HourlyVisibility, HourlyWindDirection, HOURLYWindSpeed
keep = ['Year', 'Month', 'Day', 'Hour', 'SunlightMin', 'HourlySkyWeather', 'HourlyVisibility', 'HourlyWindDirection', 'HOURLYWindSpeed']
df_skinny[keep].count()
Trear HOURLYWindSpeed for missing values
df_skinny['HOURLYWindSpeed'].head()
def get_wind_speed(row): # Create hourly wind speed feature
if pd.isna(row) == False:
return row
else:
return 'NaN'
df_skinny['HourlyWindSpeed'] = df_skinny['HOURLYWindSpeed'].apply(get_wind_speed)
df_skinny['HourlyWindSpeed'].count()
keep = ['Year', 'Month', 'Day', 'Hour', 'SunlightMin', 'HourlySkyWeather', 'HourlyVisibility', 'HourlyWindDirection', 'HourlyWindSpeed', 'SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin']
df_keep = df_skinny[keep]
df_keep.count()
df_keep.columns
df_keep.iloc[0]
df_keep.iloc[-1]
df_keep['Year'].groupby(by = df_keep['Hour']).count()
Something weird might be going on with the time features...
df_keep[df_keep['Hour'] == 23].head()
df_keep[df_keep['Hour'] == 22].head()
There are duplicate rows!
df_keep['HourlySkyWeather'].groupby(by = [df_keep['Year'], df_keep['Month'], df_keep['Day'], df_keep['Hour']]).count()
df_dedup = df_keep.drop_duplicates(subset = ['Year', 'Month', 'Day', 'Hour'], keep = 'first')
df_dedup['HourlySkyWeather'].groupby(by = [df_keep['Year'], df_keep['Month'], df_keep['Day'], df_keep['Hour']]).count()
df_dedup.shape[0] / 24
Might be missing some data but I'll try using df_dedup
df_dedup.to_csv('../resources/weatherData.csv', index = False)
df_dedup['SunlightMin'].describe()
24 * 60 #Minutes in a day
1440 - df_dedup['SunlightMin'].max() #Minimum minutes of darkness
1440 - df_dedup['SunlightMin'].min() #Maximum minutes of darkness
df_dedup['SunriseHour'].describe()
df_dedup['SunsetHour'].describe()
set(df_dedup['HourlyVisibility'])
set(df_dedup['HourlySkyWeather'])