import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
df = pd.read_csv('../resources/owm09172017_09172018.csv', low_memory=False)

/home/eric/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (10,11,20,23,25,39,40,44,45,59,63,64,67,68,84,86) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Overview of the dataset

df.head()

df.shape

(11959, 90)

df.columns

Index(['STATION', 'STATION_NAME', 'ELEVATION', 'LATITUDE', 'LONGITUDE', 'DATE',
       'REPORTTPYE', 'HOURLYSKYCONDITIONS', 'HOURLYVISIBILITY',
       'HOURLYPRSENTWEATHERTYPE', 'HOURLYDRYBULBTEMPF', 'HOURLYDRYBULBTEMPC',
       'HOURLYWETBULBTEMPF', 'HOURLYWETBULBTEMPC', 'HOURLYDewPointTempF',
       'HOURLYDewPointTempC', 'HOURLYRelativeHumidity', 'HOURLYWindSpeed',
       'HOURLYWindDirection', 'HOURLYWindGustSpeed', 'HOURLYStationPressure',
       'HOURLYPressureTendency', 'HOURLYPressureChange',
       'HOURLYSeaLevelPressure', 'HOURLYPrecip', 'HOURLYAltimeterSetting',
       'DAILYMaximumDryBulbTemp', 'DAILYMinimumDryBulbTemp',
       'DAILYAverageDryBulbTemp', 'DAILYDeptFromNormalAverageTemp',
       'DAILYAverageRelativeHumidity', 'DAILYAverageDewPointTemp',
       'DAILYAverageWetBulbTemp', 'DAILYHeatingDegreeDays',
       'DAILYCoolingDegreeDays', 'DAILYSunrise', 'DAILYSunset', 'DAILYWeather',
       'DAILYPrecip', 'DAILYSnowfall', 'DAILYSnowDepth',
       'DAILYAverageStationPressure', 'DAILYAverageSeaLevelPressure',
       'DAILYAverageWindSpeed', 'DAILYPeakWindSpeed', 'PeakWindDirection',
       'DAILYSustainedWindSpeed', 'DAILYSustainedWindDirection',
       'MonthlyMaximumTemp', 'MonthlyMinimumTemp', 'MonthlyMeanTemp',
       'MonthlyAverageRH', 'MonthlyDewpointTemp', 'MonthlyWetBulbTemp',
       'MonthlyAvgHeatingDegreeDays', 'MonthlyAvgCoolingDegreeDays',
       'MonthlyStationPressure', 'MonthlySeaLevelPressure',
       'MonthlyAverageWindSpeed', 'MonthlyTotalSnowfall',
       'MonthlyDeptFromNormalMaximumTemp', 'MonthlyDeptFromNormalMinimumTemp',
       'MonthlyDeptFromNormalAverageTemp', 'MonthlyDeptFromNormalPrecip',
       'MonthlyTotalLiquidPrecip', 'MonthlyGreatestPrecip',
       'MonthlyGreatestPrecipDate', 'MonthlyGreatestSnowfall',
       'MonthlyGreatestSnowfallDate', 'MonthlyGreatestSnowDepth',
       'MonthlyGreatestSnowDepthDate', 'MonthlyDaysWithGT90Temp',
       'MonthlyDaysWithLT32Temp', 'MonthlyDaysWithGT32Temp',
       'MonthlyDaysWithLT0Temp', 'MonthlyDaysWithGT001Precip',
       'MonthlyDaysWithGT010Precip', 'MonthlyDaysWithGT1Snow',
       'MonthlyMaxSeaLevelPressureValue', 'MonthlyMaxSeaLevelPressureDate',
       'MonthlyMaxSeaLevelPressureTime', 'MonthlyMinSeaLevelPressureValue',
       'MonthlyMinSeaLevelPressureDate', 'MonthlyMinSeaLevelPressureTime',
       'MonthlyTotalHeatingDegreeDays', 'MonthlyTotalCoolingDegreeDays',
       'MonthlyDeptFromNormalHeatingDD', 'MonthlyDeptFromNormalCoolingDD',
       'MonthlyTotalSeasonToDateHeatingDD',
       'MonthlyTotalSeasonToDateCoolingDD'],
      dtype='object')

All data is, by design, from the Central Park station
Ideas
Cloud cover sunlight/present weather
Could I do something neat with wind direction and speed? Some kind of cartesian coordinates with vectors for wind?
Could choose daily data if hourly is too cluttered. Choose daily or monthly if displaying multiple years' worth of data

columns_keep = ['DATE', 'HOURLYSKYCONDITIONS', 'HOURLYVISIBILITY', 'HOURLYPRSENTWEATHERTYPE', 'HOURLYRelativeHumidity', 'HOURLYWindSpeed', 'HOURLYWindDirection',
               'HOURLYWindGustSpeed', 'HOURLYPrecip', 'DAILYSunrise', 'DAILYSunset', 'DAILYWeather', 'DAILYPrecip', 'DAILYSnowfall']

df_thin = df[columns_keep]

df_thin.head()

Missing data

df_thin.isna().sum()

DATE                           0
HOURLYSKYCONDITIONS          393
HOURLYVISIBILITY             364
HOURLYPRSENTWEATHERTYPE     8792
HOURLYRelativeHumidity       362
HOURLYWindSpeed             1147
HOURLYWindDirection         1149
HOURLYWindGustSpeed        10162
HOURLYPrecip                1973
DAILYSunrise                   0
DAILYSunset                    0
DAILYWeather               11762
DAILYPrecip                11447
DAILYSnowfall              11595
dtype: int64

Available Data

df_thin.shape[0] - df_thin.isna().sum()

DATE                       11959
HOURLYSKYCONDITIONS        11566
HOURLYVISIBILITY           11595
HOURLYPRSENTWEATHERTYPE     3167
HOURLYRelativeHumidity     11597
HOURLYWindSpeed            10812
HOURLYWindDirection        10810
HOURLYWindGustSpeed         1797
HOURLYPrecip                9986
DAILYSunrise               11959
DAILYSunset                11959
DAILYWeather                 197
DAILYPrecip                  512
DAILYSnowfall                364
dtype: int64

Drop daily values for now

df_skinny = df_thin.drop(['DAILYWeather', 'DAILYPrecip', 'DAILYSnowfall'], axis=1)

df_skinny.shape[0] - df_skinny.isna().sum()

DATE                       11959
HOURLYSKYCONDITIONS        11566
HOURLYVISIBILITY           11595
HOURLYPRSENTWEATHERTYPE     3167
HOURLYRelativeHumidity     11597
HOURLYWindSpeed            10812
HOURLYWindDirection        10810
HOURLYWindGustSpeed         1797
HOURLYPrecip                9986
DAILYSunrise               11959
DAILYSunset                11959
dtype: int64

Create features for DATE parts

df_skinny['DATE'].head()

0    2017-09-16 00:51
1    2017-09-16 01:51
2    2017-09-16 02:51
3    2017-09-16 03:51
4    2017-09-16 03:59
Name: DATE, dtype: object

def get_year(row):
    """Return the year of a DATE string as an int.

    The year is read from the first four characters of ``row``,
    e.g. ``'2017-09-16 00:51'`` gives ``2017``.
    """
    year_text = row[0:4]
    return int(year_text)
df_skinny['Year'] = df_skinny['DATE'].apply(get_year)

def get_month(row):
    """Return the month of a DATE string as an int.

    The month is read from characters 5-6 of ``row``,
    e.g. ``'2017-09-16 00:51'`` gives ``9``.
    """
    month_text = row[5:7]
    return int(month_text)
df_skinny['Month'] = df_skinny['DATE'].apply(get_month)

def get_day(row):
    """Return the day of the month of a DATE string as an int.

    The day is read from characters 8-9 of ``row``,
    e.g. ``'2017-09-16 00:51'`` gives ``16``.
    """
    day_text = row[8:10]
    return int(day_text)
df_skinny['Day'] = df_skinny['DATE'].apply(get_day)

def get_hour(row):
    """Return the hour of a DATE string as an int.

    The hour is read from characters 11-12 of ``row``,
    e.g. ``'2017-09-16 00:51'`` gives ``0``.
    """
    hour_text = row[11:13]
    return int(hour_text)
df_skinny['Hour'] = df_skinny['DATE'].apply(get_hour)

Create matching features for sunrise and sunset

df_skinny['SunriseHour'] = df_skinny['DAILYSunrise'] // 100

df_skinny['SunriseMin'] = df_skinny['DAILYSunrise'] % 100

df_skinny['SunsetHour'] = df_skinny['DAILYSunset'] // 100

df_skinny['SunsetMin'] = df_skinny['DAILYSunset'] % 100

df_skinny['HOURLYSKYCONDITIONS'].head()

0    CLR:00
1    CLR:00
2    CLR:00
3    CLR:00
4    CLR:00
Name: HOURLYSKYCONDITIONS, dtype: object

def get_weather_type(row):
    """Get the leading sky-condition code from a HOURLYSKYCONDITIONS value.

    Returns the run of uppercase letters at the very start of ``row``
    (``'CLR:00'`` -> ``'CLR'``). If ``row`` does not start with an uppercase
    letter the result is the empty string. Any non-string value (such as a
    missing NaN entry) yields the string ``'NaN'``.
    """
    # isinstance also accepts str subclasses, which ``type(row) == str`` rejected.
    if not isinstance(row, str):
        return 'NaN'
    # ``[A-Z]*`` can match the empty string, so this match never fails.
    return re.match(r'[A-Z]*', row).group(0)

df_skinny['HourlySkyWeather'] = df_skinny['HOURLYSKYCONDITIONS'].apply(get_weather_type)

df_skinny['HourlySkyWeather'].head()

0    CLR
1    CLR
2    CLR
3    CLR
4    CLR
Name: HourlySkyWeather, dtype: object

set(df_skinny['HourlySkyWeather'])

{'BKN', 'CLR', 'FEW', 'NaN', 'OVC', 'SCT', 'VV'}

'10' which was supposed to denote partly cloudy, does that entry exist in the original feature? No

df_skinny['HOURLYSKYCONDITIONS'][df_skinny['HOURLYSKYCONDITIONS'].str.startswith('10', na=False)]

Series([], Name: HOURLYSKYCONDITIONS, dtype: object)

Could try to extract all weather types from HOURLYSKYCONDITIONS, but hold off until after looking at HOURLYPRSENTWEATHERTYPE

df_skinny['HOURLYVISIBILITY'].describe()

count     11595
unique       25
top       10.00
freq       7305
Name: HOURLYVISIBILITY, dtype: object

set(df_skinny['HOURLYVISIBILITY'])

{'0.00',
 '0.25',
 '0.50',
 '0.75',
 '0.75V',
 '1.00',
 '1.00V',
 '1.25',
 '1.25V',
 '1.50',
 '1.50V',
 '1.75',
 '1.75V',
 '10.00',
 '2.00',
 '2.00V',
 '2.50',
 '2.50V',
 '3.00',
 '4.00',
 '5.00',
 '6.00',
 '7.00',
 '8.00',
 '9.00',
 nan}

Remove the trailing V because the documentation does not provide a justification for it

def get_visibility(row):
    """Create a tidy hourly visibility value from a HOURLYVISIBILITY entry.

    Returns the leading number of ``row`` as a float, dropping anything after
    it such as a trailing ``V`` (``'0.75V'`` -> ``0.75``). Returns the string
    ``'NaN'`` when ``row`` is not a string (e.g. a missing value) or does not
    start with a number; the latter previously raised ``ValueError``.
    """
    # isinstance also accepts str subclasses, which ``type(row) == str`` rejected.
    if not isinstance(row, str):
        return 'NaN'
    # Accept "10", "10.00", "1." and ".5"; tolerate leading whitespace.
    found = re.match(r'\s*(\d+(?:\.\d*)?|\.\d+)', row)
    if found is None:
        return 'NaN'
    return float(found.group(1))

df_skinny['HourlyVisibility'] = df_skinny['HOURLYVISIBILITY'].apply(get_visibility)

set(df_skinny['HourlyVisibility'])

{0.0,
 0.25,
 0.5,
 0.75,
 1.0,
 1.25,
 1.5,
 1.75,
 10.0,
 2.0,
 2.5,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 'NaN'}

df_skinny['HOURLYPRSENTWEATHERTYPE'].describe()

count        3167
unique         37
top       BR:1 ||
freq         1035
Name: HOURLYPRSENTWEATHERTYPE, dtype: object

HOURLYPRSENTWEATHERTYPE is formatted as AU|AW|MW where AU and AW are automatic weather type measurements and MW is a manual measurement

set(df_skinny['HOURLYPRSENTWEATHERTYPE'])

{'+RA:02 BR:1 |RA:63 |',
 '+RA:02 BR:1 |RA:63 |RA:63',
 '+RA:02 FG:2 |FG:30 RA:63 |',
 '+RA:02 FG:2 |FG:30 RA:63 |RA:63',
 '+RA:02 |RA:63 |',
 '+RA:02 |RA:63 |RA:63',
 '+SN:03 FG:2 |FG:30 SN:73 |',
 '+SN:03 FZ:8 FG:2 |FG:35 SN:73 |',
 '-FZ:8 RA:02 BR:1 |FZRA:64 |',
 '-RA:02 BR:1 |RA:61 |',
 '-RA:02 BR:1 |RA:61 |RA:61',
 '-RA:02 FG:2 |FG:30 RA:61 |RA:61',
 '-RA:02 |RA:61 |',
 '-RA:02 |RA:61 |RA:61',
 '-RA:02 ||',
 '-SN:03 BR:1 |SN:71 |',
 '-SN:03 FZ:8 FG:2 |FG:35 SN:71 |',
 '-SN:03 |SN:71 |',
 'BR:1 ||',
 'FG:2 |FG:30 |',
 'FZ:8 RA:02 BR:1 |FZRA:65 |',
 'HZ:7 |FU:05 |HZ:05',
 'HZ:7 ||HZ:05',
 'RA:02 BR:1 |RA:62 |',
 'RA:02 BR:1 |RA:62 |RA:65',
 'RA:02 FG:2 |FG:30 RA:62 |RA:65',
 'RA:02 |RA:62 s |RA:65 s',
 'RA:02 |RA:62 |',
 'RA:02 |RA:62 |RA:65',
 'SN:03 FG:2 |FG:30 SN:72 |',
 'SN:03 FZ:8 FG:2 |FG:35 SN:72 |',
 'SN:03 |SN:72 s |',
 'UP:09 BR:1 ||',
 'UP:09 ||',
 nan,
 '|RA:61 |',
 '|RA:61 |RA:61',
 '|SN:71 |'}

Test regex

string = 'SN:03 FZ:8 FG:2 |FG:35 SN:72 |'
re.split(r'\|', string)

['SN:03 FZ:8 FG:2 ', 'FG:35 SN:72 ', '']

def _present_weather_part(row, position):
    """Return one ``|``-separated segment of a HOURLYPRSENTWEATHERTYPE value.

    ``position`` is the zero-based index of the segment. Non-string values
    (e.g. a missing NaN entry) yield the string ``'NaN'``. A string with too
    few ``|`` separators yields ``''``, the same value an empty segment gives,
    instead of raising ``IndexError``. Segments are returned as-is, including
    any trailing space.
    """
    if not isinstance(row, str):
        return 'NaN'
    parts = row.split('|')
    return parts[position] if position < len(parts) else ''


def get_AU_weather(row):
    """Get the AU weather measurement (first ``|``-separated segment)."""
    return _present_weather_part(row, 0)


def get_AW_weather(row):
    """Get the AW weather measurement (second ``|``-separated segment)."""
    return _present_weather_part(row, 1)


def get_MW_weather(row):
    """Get the MW weather measurement (third ``|``-separated segment)."""
    return _present_weather_part(row, 2)

df_skinny['AUWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_AU_weather)
df_skinny['AWWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_AW_weather)
df_skinny['MWWeather'] = df_skinny['HOURLYPRSENTWEATHERTYPE'].apply(get_MW_weather)

set(df_skinny['AUWeather'])

{'',
 '+RA:02 ',
 '+RA:02 BR:1 ',
 '+RA:02 FG:2 ',
 '+SN:03 FG:2 ',
 '+SN:03 FZ:8 FG:2 ',
 '-FZ:8 RA:02 BR:1 ',
 '-RA:02 ',
 '-RA:02 BR:1 ',
 '-RA:02 FG:2 ',
 '-SN:03 ',
 '-SN:03 BR:1 ',
 '-SN:03 FZ:8 FG:2 ',
 'BR:1 ',
 'FG:2 ',
 'FZ:8 RA:02 BR:1 ',
 'HZ:7 ',
 'NaN',
 'RA:02 ',
 'RA:02 BR:1 ',
 'RA:02 FG:2 ',
 'SN:03 ',
 'SN:03 FG:2 ',
 'SN:03 FZ:8 FG:2 ',
 'UP:09 ',
 'UP:09 BR:1 '}

Refine these weather type features if they are to be used

df_skinny['HOURLYRelativeHumidity'].describe()

count    11597.000000
mean        70.075278
std         20.481264
min         12.000000
25%         53.000000
50%         72.000000
75%         90.000000
max        100.000000
Name: HOURLYRelativeHumidity, dtype: float64

fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYRelativeHumidity'][df_skinny['HOURLYRelativeHumidity'].isna() == False], bins=25)
axs.set_title('Hourly Relative Humidity')

Text(0.5,1,'Hourly Relative Humidity')

df_skinny['HOURLYWindSpeed'].describe()

count    10812.000000
mean         4.800499
std          3.687856
min          0.000000
25%          3.000000
50%          5.000000
75%          7.000000
max         21.000000
Name: HOURLYWindSpeed, dtype: float64

fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindSpeed'][df_skinny['HOURLYWindSpeed'].isna() == False], bins=30)
axs.set_title('Hourly Wind Speed')

Text(0.5,1,'Hourly Wind Speed')

There is an indication of rounding for wind speed at low values, e.g. 0, 3 and 5 are used in place of 1, 2 and 4

Wind direction is given in degrees from true north, with 360 as North and 000 as calm conditions
No explanation in the documentation for VRB, so treat it as a missing value

df_skinny['HOURLYWindDirection'].describe()

count     10810
unique       38
top         VRB
freq       4622
Name: HOURLYWindDirection, dtype: object

set(df_skinny['HOURLYWindDirection'])

{'000',
 '010',
 '020',
 '030',
 '040',
 '050',
 '060',
 '070',
 '080',
 '090',
 '100',
 '110',
 '120',
 '130',
 '140',
 '150',
 '160',
 '170',
 '180',
 '190',
 '200',
 '210',
 '220',
 '230',
 '240',
 '250',
 '260',
 '270',
 '280',
 '290',
 '300',
 '310',
 '320',
 '330',
 '340',
 '350',
 '360',
 'VRB',
 nan}

Make a numeric feature for Wind Direction

def get_numeric_wind(row):
    """Create a numeric wind direction value and remove the VRB encoding.

    Returns ``int(row)`` when the string form of ``row`` is all digits
    (``'090'`` -> ``90``). Anything else, such as ``'VRB'`` or a missing
    value, returns ``float('nan')``.

    NaN is used rather than ``False`` because ``False == 0`` in Python, so a
    ``False`` marker is grouped and counted together with a genuine direction
    of 0.
    """
    if str(row).isnumeric():
        return int(row)
    return float('nan')

df_skinny['HourlyWindDirection'] = df_skinny['HOURLYWindDirection'].apply(get_numeric_wind)

set(df_skinny['HourlyWindDirection'])

{0,
 10,
 20,
 30,
 40,
 50,
 60,
 70,
 80,
 90,
 100,
 110,
 120,
 130,
 140,
 150,
 160,
 170,
 180,
 190,
 200,
 210,
 220,
 230,
 240,
 250,
 260,
 270,
 280,
 290,
 300,
 310,
 320,
 330,
 340,
 350,
 360}

df_skinny['DATE'].groupby(by = df_skinny['HourlyWindDirection']).count()

HourlyWindDirection
0      8331
10       59
20       57
30       96
40      141
50      250
60      336
70      238
80       97
90       43
100      35
110      30
120      26
130      27
140      28
150      55
160      98
170      90
180      60
190      24
200       9
210      11
220      10
230      24
240      95
250     152
260     186
270     168
280     286
290     317
300     287
310     158
320      30
330      11
340      30
350      33
360      31
Name: DATE, dtype: int64

df_skinny['DATE'].groupby(by = df_skinny['HourlyWindDirection']).count() / df_skinny['DATE'][df_skinny['HourlyWindDirection'].isna() == False].count()

HourlyWindDirection
0      0.696630
10     0.004934
20     0.004766
30     0.008027
40     0.011790
50     0.020905
60     0.028096
70     0.019901
80     0.008111
90     0.003596
100    0.002927
110    0.002509
120    0.002174
130    0.002258
140    0.002341
150    0.004599
160    0.008195
170    0.007526
180    0.005017
190    0.002007
200    0.000753
210    0.000920
220    0.000836
230    0.002007
240    0.007944
250    0.012710
260    0.015553
270    0.014048
280    0.023915
290    0.026507
300    0.023999
310    0.013212
320    0.002509
330    0.000920
340    0.002509
350    0.002759
360    0.002592
Name: DATE, dtype: float64

# Share of observations per wind direction, leaving out the rows whose
# direction equals 0.
wind_direction = df_skinny['HourlyWindDirection']
has_direction = wind_direction.notna() & (wind_direction != 0)
direction_counts = df_skinny['DATE'][wind_direction != 0].groupby(by=wind_direction).count()
df_wind = direction_counts / df_skinny['DATE'][has_direction].count()

df_wind.sort_values(ascending = False)[:10]

HourlyWindDirection
60     0.092613
290    0.087376
300    0.079107
280    0.078831
50     0.068908
70     0.065601
260    0.051268
270    0.046307
310    0.043550
250    0.041896
Name: DATE, dtype: float64

The most frequent nonzero wind direction values belong to east and west directions, i.e. 50-70 for east and 260-300 for west

df_skinny['HOURLYWindSpeed'].describe()

count    10812.000000
mean         4.800499
std          3.687856
min          0.000000
25%          3.000000
50%          5.000000
75%          7.000000
max         21.000000
Name: HOURLYWindSpeed, dtype: float64

fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindSpeed'][df_skinny['HOURLYWindSpeed'].isna() == False], bins = 30)
axs.set_title('Hourly Wind Speed')

Text(0.5,1,'Hourly Wind Speed')

Again, there is evidence of rounding at low integer values, ie at 0, 3, 5 instead of 1, 2 and 4

df_skinny['HOURLYWindGustSpeed'].describe()

count    1797.000000
mean       21.293823
std         4.252267
min        16.000000
25%        18.000000
50%        21.000000
75%        23.000000
max        44.000000
Name: HOURLYWindGustSpeed, dtype: float64

fig = plt.figure()
axs = fig.add_subplot(111)
axs.hist(df_skinny['HOURLYWindGustSpeed'][df_skinny['HOURLYWindGustSpeed'].isna() == False], bins = 30)
axs.set_title('Hourly Wind Gust Speed')

Text(0.5,1,'Hourly Wind Gust Speed')

There was probably rounding at wind gust speeds approaching 20 mph

Make feature for minutes of sunlight

df_skinny[['SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin']].head()

df_skinny['SunlightMin'] = (df_skinny['SunsetHour'] - df_skinny['SunriseHour']) * 60 + df_skinny['SunsetMin'] - df_skinny['SunriseMin']

df_skinny[['SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin', 'SunlightMin']].iloc[100]
# Spot checking the results

SunriseHour      5
SunriseMin      39
SunsetHour      18
SunsetMin        0
SunlightMin    741
Name: 100, dtype: int64

Remove troublesome values from the features to be used in visualization:
Year, Month, Day, Hour, SunlightMin, HourlySkyWeather, HourlyVisibility, HourlyWindDirection, HOURLYWindSpeed

keep = ['Year', 'Month', 'Day', 'Hour', 'SunlightMin', 'HourlySkyWeather', 'HourlyVisibility', 'HourlyWindDirection', 'HOURLYWindSpeed']

df_skinny[keep].count()

Year                   11959
Month                  11959
Day                    11959
Hour                   11959
SunlightMin            11959
HourlySkyWeather       11959
HourlyVisibility       11959
HourlyWindDirection    11959
HOURLYWindSpeed        10812
dtype: int64

Treat HOURLYWindSpeed for missing values

df_skinny['HOURLYWindSpeed'].head()

0    0.0
1    0.0
2    3.0
3    3.0
4    0.0
Name: HOURLYWindSpeed, dtype: float64

def get_wind_speed(row):
    """Create the hourly wind speed value for one HOURLYWindSpeed entry.

    Missing values (as detected by ``pd.isna``) become the string ``'NaN'``;
    every other value is returned unchanged.
    """
    if pd.isna(row):
        return 'NaN'
    return row

df_skinny['HourlyWindSpeed'] = df_skinny['HOURLYWindSpeed'].apply(get_wind_speed)

df_skinny['HourlyWindSpeed'].count()

11959

keep = ['Year', 'Month', 'Day', 'Hour', 'SunlightMin', 'HourlySkyWeather', 'HourlyVisibility', 'HourlyWindDirection', 'HourlyWindSpeed', 'SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin']

df_keep = df_skinny[keep]

df_keep.count()

Year                   11959
Month                  11959
Day                    11959
Hour                   11959
SunlightMin            11959
HourlySkyWeather       11959
HourlyVisibility       11959
HourlyWindDirection    11959
HourlyWindSpeed        11959
SunriseHour            11959
SunriseMin             11959
SunsetHour             11959
SunsetMin              11959
dtype: int64

df_keep.columns

Index(['Year', 'Month', 'Day', 'Hour', 'SunlightMin', 'HourlySkyWeather',
       'HourlyVisibility', 'HourlyWindDirection', 'HourlyWindSpeed',
       'SunriseHour', 'SunriseMin', 'SunsetHour', 'SunsetMin'],
      dtype='object')

df_keep.iloc[0]

Year                   2017
Month                     9
Day                      16
Hour                      0
SunlightMin             747
HourlySkyWeather        CLR
HourlyVisibility          7
HourlyWindDirection       0
HourlyWindSpeed           0
SunriseHour               5
SunriseMin               37
SunsetHour               18
SunsetMin                 4
Name: 0, dtype: object

df_keep.iloc[-1]

Year                    2018
Month                      9
Day                       15
Hour                      23
SunlightMin              747
HourlySkyWeather         CLR
HourlyVisibility         NaN
HourlyWindDirection    False
HourlyWindSpeed          NaN
SunriseHour                5
SunriseMin                37
SunsetHour                18
SunsetMin                  4
Name: 11958, dtype: object

df_keep['Year'].groupby(by = df_keep['Hour']).count()

Hour
0     485
1     507
2     491
3     496
4     507
5     537
6     533
7     506
8     489
9     496
10    465
11    492
12    474
13    460
14    455
15    462
16    463
17    475
18    453
19    446
20    466
21    474
22    470
23    857
Name: Year, dtype: int64

Something weird might be going on with the time features...

df_keep[df_keep['Hour'] == 23].head()

df_keep[df_keep['Hour'] == 22].head()

There are duplicate rows!

df_keep['HourlySkyWeather'].groupby(by = [df_keep['Year'], df_keep['Month'], df_keep['Day'], df_keep['Hour']]).count()

Year  Month  Day  Hour
2017  9      16   0       1
                  1       1
                  2       1
                  3       2
                  4       3
                  5       2
                  6       2
                  7       3
                  8       2
                  9       1
                  10      4
                  11      2
                  12      2
                  13      1
                  14      1
                  15      1
                  16      1
                  17      1
                  18      1
                  19      1
                  20      1
                  21      1
                  22      2
                  23      3
             17   0       1
                  1       2
                  2       4
                  3       4
                  4       3
                  5       4
                         ..
2018  9      14   16      1
                  17      1
                  18      1
                  19      1
                  20      1
                  21      1
                  22      1
                  23      2
             15   0       4
                  1       1
                  2       1
                  3       1
                  4       1
                  5       1
                  6       1
                  7       3
                  8       2
                  9       2
                  10      2
                  11      1
                  12      2
                  13      1
                  14      1
                  15      1
                  16      1
                  19      1
                  20      1
                  21      1
                  22      1
                  23      1
Name: HourlySkyWeather, Length: 8758, dtype: int64

df_dedup = df_keep.drop_duplicates(subset = ['Year', 'Month', 'Day', 'Hour'], keep = 'first')

# Count rows per (Year, Month, Day, Hour) using df_dedup's own columns rather
# than df_keep's, to check that only one row per hour is left.
df_dedup.groupby(['Year', 'Month', 'Day', 'Hour'])['HourlySkyWeather'].count()

Year  Month  Day  Hour
2017  9      16   0       1
                  1       1
                  2       1
                  3       1
                  4       1
                  5       1
                  6       1
                  7       1
                  8       1
                  9       1
                  10      1
                  11      1
                  12      1
                  13      1
                  14      1
                  15      1
                  16      1
                  17      1
                  18      1
                  19      1
                  20      1
                  21      1
                  22      1
                  23      1
             17   0       1
                  1       1
                  2       1
                  3       1
                  4       1
                  5       1
                         ..
2018  9      14   16      1
                  17      1
                  18      1
                  19      1
                  20      1
                  21      1
                  22      1
                  23      1
             15   0       1
                  1       1
                  2       1
                  3       1
                  4       1
                  5       1
                  6       1
                  7       1
                  8       1
                  9       1
                  10      1
                  11      1
                  12      1
                  13      1
                  14      1
                  15      1
                  16      1
                  19      1
                  20      1
                  21      1
                  22      1
                  23      1
Name: HourlySkyWeather, Length: 8758, dtype: int64

df_dedup.shape[0] / 24

364.9166666666667

Might be missing some data but I'll try using df_dedup

df_dedup.to_csv('../resources/weatherData.csv', index = False)

df_dedup['SunlightMin'].describe()

count    8758.000000
mean      732.412651
std       120.662506
min       554.000000
25%       616.000000
50%       734.000000
75%       849.000000
max       906.000000
Name: SunlightMin, dtype: float64

24 * 60 #Minutes in a day

1440

1440 - df_dedup['SunlightMin'].max() #Minimum minutes of darkness

534

1440 - df_dedup['SunlightMin'].min() #Maximum minutes of darkness

886

df_dedup['SunriseHour'].describe()

count    8758.000000
mean        5.353505
std         1.082291
min         4.000000
25%         4.000000
50%         5.000000
75%         6.000000
max         7.000000
Name: SunriseHour, dtype: float64

df_dedup['SunsetHour'].describe()

count    8758.000000
mean       17.545102
std         1.113604
min        16.000000
25%        17.000000
50%        18.000000
75%        19.000000
max        19.000000
Name: SunsetHour, dtype: float64

set(df_dedup['HourlyVisibility'])

{0.0,
 0.25,
 0.5,
 0.75,
 1.0,
 1.25,
 1.5,
 1.75,
 10.0,
 2.0,
 2.5,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 'NaN'}

set(df_dedup['HourlySkyWeather'])

{'BKN', 'CLR', 'FEW', 'NaN', 'OVC', 'SCT', 'VV'}

	STATION	STATION_NAME	ELEVATION	LATITUDE	LONGITUDE	DATE	REPORTTPYE	HOURLYSKYCONDITIONS	HOURLYVISIBILITY	HOURLYPRSENTWEATHERTYPE	...	MonthlyMaxSeaLevelPressureTime	MonthlyMinSeaLevelPressureValue	MonthlyMinSeaLevelPressureDate	MonthlyMinSeaLevelPressureTime	MonthlyTotalHeatingDegreeDays	MonthlyTotalCoolingDegreeDays	MonthlyDeptFromNormalHeatingDD	MonthlyDeptFromNormalCoolingDD	MonthlyTotalSeasonToDateHeatingDD	MonthlyTotalSeasonToDateCoolingDD
0	WBAN:94728	NY CITY CENTRAL PARK NY US	42.7	40.77898	-73.96925	2017-09-16 00:51	FM-15	CLR:00	7.00	NaN	...	-9999	NaN	-9999	-9999	NaN	NaN	NaN	NaN	NaN	NaN
1	WBAN:94728	NY CITY CENTRAL PARK NY US	42.7	40.77898	-73.96925	2017-09-16 01:51	FM-15	CLR:00	6.00	BR:1 \|\|	...	-9999	NaN	-9999	-9999	NaN	NaN	NaN	NaN	NaN	NaN
2	WBAN:94728	NY CITY CENTRAL PARK NY US	42.7	40.77898	-73.96925	2017-09-16 02:51	FM-15	CLR:00	4.00	BR:1 \|\|	...	-9999	NaN	-9999	-9999	NaN	NaN	NaN	NaN	NaN	NaN
3	WBAN:94728	NY CITY CENTRAL PARK NY US	42.7	40.77898	-73.96925	2017-09-16 03:51	FM-15	CLR:00	3.00	BR:1 \|\|	...	-9999	NaN	-9999	-9999	NaN	NaN	NaN	NaN	NaN	NaN
4	WBAN:94728	NY CITY CENTRAL PARK NY US	42.7	40.77898	-73.96925	2017-09-16 03:59	FM-16	CLR:00	2.50	BR:1 \|\|	...	-9999	NaN	-9999	-9999	NaN	NaN	NaN	NaN	NaN	NaN

	Year	Month	Day	Hour	SunlightMin	HourlySkyWeather	HourlyVisibility	HourlyWindDirection	HourlyWindSpeed	SunriseHour	SunriseMin	SunsetHour	SunsetMin
37	2017	9	16	23	744	SCT	5	0	0	5	38	18	2
38	2017	9	16	23	744	FEW	5	0	0	5	38	18	2
39	2017	9	16	23	744	NaN	NaN	False	NaN	5	38	18	2
87	2017	9	17	23	741	OVC	7	0	0	5	39	18	0
88	2017	9	17	23	741	BKN	7	60	3	5	39	18	0