CMSC320 Final Project By: Jessica Lee, Kelley Li
Bike share programs are a type of shared public transportation system that is considered eco-friendly and sustainable. Typically, these systems allow people to rent a bike at a station, travel to their destination, and return the bike to another station. Such programs are frequently implemented in more densely populated areas such as cities, and have been relatively successful. The majority of revenue comes from bike users and the length of time they borrow a bike. Thus, from a business standpoint, it is important to know who the users are and what common patterns appear among them. Knowing this information allows us to see which business strategies could optimize the conditions under which people use bike share programs and potentially incentivize people to use them more.
In this tutorial, our goal is to walk through the data science lifecycle and show how data science tools and techniques can be used to find patterns within data, and to use inferences from the data to make predictions. These predictions can then be used to improve the bike share program.
In this tutorial, we used the following libraries:
Please also note that we are using Python 3.
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
import json
import seaborn as sns
import scipy.stats as st
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, classification_report
from sklearn.model_selection import GridSearchCV
The dataset used was obtained from Kaggle and covers a bike share program in the San Francisco Bay Area. It contains two years of anonymized bike trip data, from August 2013 to August 2015. In total, there are 4 tables: station, status, trip, and weather.
Station has 70 rows and 7 columns, representing the stations where users can borrow and return bikes. Status has 71,984,434 rows and 4 columns of minute-by-minute data about the number of bikes and docks available at each station; due to the sheer size of this table and the limits of our computing power, we chose not to include it. Trip has 669,959 rows and 11 columns of individual bike trip data. Weather has 3,665 rows and 24 columns with data about the weather on a particular day for specific zipcodes.
More information on the origin of the dataset can be found here: SF Bay Area Bike Share Data
After obtaining our data from Kaggle, our next step is to import it into pandas dataframes. The Kaggle download is a zip file, which we unzipped locally; it contains both a sqlite version and a csv version of the data. Below, we import the data from the sqlite file, but we could also read each csv file with <dataframe> = pd.read_csv(<file path>).
We first verify that the sqlite file contains all tables. We obtain a list of table names, which reveal the tables to be station, status, trip, and weather as expected.
# create connection to sqlite file
con = sqlite3.connect("database.sqlite")
# get tables
sql_query = "SELECT name FROM sqlite_master WHERE type='table';"
cursor = con.cursor()
cursor.execute(sql_query)
print(cursor.fetchall())
[('station',), ('status',), ('trip',), ('weather',)]
To extract an entire table, we use SQL statements to pull all rows from each table and insert them into a dataframe. Note that for this tutorial we will only be using 3 of the 4 tables since, as mentioned previously, the status table contains more data than our computing environment can handle. We then output the columns as well as a few rows of each dataframe to get a general idea of what data is available in each extracted table.
# Extract data from sqlite file with SQL queries
station = pd.read_sql_query("SELECT * from station", con)
trip = pd.read_sql_query("SELECT * from trip", con)
weather = pd.read_sql_query("SELECT * from weather", con)
# if you would like to try out reading from csv files, you can try uncommenting the code segment below!
#station = pd.read_csv("archive/station.csv")
#trip = pd.read_csv("archive/trip.csv")
#weather = pd.read_csv("archive/weather.csv")
# Output station table info
print(station.dtypes)
station.head()
id                    int64
name                 object
lat                 float64
long                float64
dock_count            int64
city                 object
installation_date    object
dtype: object
id | name | lat | long | dock_count | city | installation_date | |
---|---|---|---|---|---|---|---|
0 | 2 | San Jose Diridon Caltrain Station | 37.329732 | -121.901782 | 27 | San Jose | 8/6/2013 |
1 | 3 | San Jose Civic Center | 37.330698 | -121.888979 | 15 | San Jose | 8/5/2013 |
2 | 4 | Santa Clara at Almaden | 37.333988 | -121.894902 | 11 | San Jose | 8/6/2013 |
3 | 5 | Adobe on Almaden | 37.331415 | -121.893200 | 19 | San Jose | 8/5/2013 |
4 | 6 | San Pedro Square | 37.336721 | -121.894074 | 15 | San Jose | 8/7/2013 |
# Output trip table info
print(trip.dtypes)
trip.head()
id                     int64
duration               int64
start_date            object
start_station_name    object
start_station_id       int64
end_date              object
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object
id | duration | start_date | start_station_name | start_station_id | end_date | end_station_name | end_station_id | bike_id | subscription_type | zip_code | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4069 | 174 | 8/29/2013 9:08 | 2nd at South Park | 64 | 8/29/2013 9:11 | 2nd at South Park | 64 | 288 | Subscriber | 94114 |
1 | 4073 | 1067 | 8/29/2013 9:24 | South Van Ness at Market | 66 | 8/29/2013 9:42 | San Francisco Caltrain 2 (330 Townsend) | 69 | 321 | Subscriber | 94703 |
2 | 4074 | 1131 | 8/29/2013 9:24 | South Van Ness at Market | 66 | 8/29/2013 9:43 | San Francisco Caltrain 2 (330 Townsend) | 69 | 317 | Subscriber | 94115 |
3 | 4075 | 1117 | 8/29/2013 9:24 | South Van Ness at Market | 66 | 8/29/2013 9:43 | San Francisco Caltrain 2 (330 Townsend) | 69 | 316 | Subscriber | 94122 |
4 | 4076 | 1118 | 8/29/2013 9:25 | South Van Ness at Market | 66 | 8/29/2013 9:43 | San Francisco Caltrain 2 (330 Townsend) | 69 | 322 | Subscriber | 94597 |
# Output weather table info
print(weather.dtypes)
weather.head()
date                              object
max_temperature_f                 object
mean_temperature_f                object
min_temperature_f                 object
max_dew_point_f                   object
mean_dew_point_f                  object
min_dew_point_f                   object
max_humidity                      object
mean_humidity                     object
min_humidity                      object
max_sea_level_pressure_inches     object
mean_sea_level_pressure_inches    object
min_sea_level_pressure_inches     object
max_visibility_miles              object
mean_visibility_miles             object
min_visibility_miles              object
max_wind_Speed_mph                object
mean_wind_speed_mph               object
max_gust_speed_mph                object
precipitation_inches              object
cloud_cover                       object
events                            object
wind_dir_degrees                  object
zip_code                           int64
dtype: object
date | max_temperature_f | mean_temperature_f | min_temperature_f | max_dew_point_f | mean_dew_point_f | min_dew_point_f | max_humidity | mean_humidity | min_humidity | ... | mean_visibility_miles | min_visibility_miles | max_wind_Speed_mph | mean_wind_speed_mph | max_gust_speed_mph | precipitation_inches | cloud_cover | events | wind_dir_degrees | zip_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8/29/2013 | 74 | 68 | 61 | 61 | 58 | 56 | 93 | 75 | 57 | ... | 10 | 10 | 23 | 11 | 28 | 0 | 4 | 286 | 94107 | |
1 | 8/30/2013 | 78 | 69 | 60 | 61 | 58 | 56 | 90 | 70 | 50 | ... | 10 | 7 | 29 | 13 | 35 | 0 | 2 | 291 | 94107 | |
2 | 8/31/2013 | 71 | 64 | 57 | 57 | 56 | 54 | 93 | 75 | 57 | ... | 10 | 10 | 26 | 15 | 31 | 0 | 4 | 284 | 94107 | |
3 | 9/1/2013 | 74 | 66 | 58 | 60 | 56 | 53 | 87 | 68 | 49 | ... | 10 | 10 | 25 | 13 | 29 | 0 | 4 | 284 | 94107 | |
4 | 9/2/2013 | 75 | 69 | 62 | 61 | 60 | 58 | 93 | 77 | 61 | ... | 10 | 6 | 23 | 12 | 30 | 0 | 6 | 277 | 94107 |
5 rows × 24 columns
After importing the data, our next step is to clean it. From the output above, we see that some of the column data types are not ideal, so we want to transform them into something easier to work with before performing any analysis. For example, date-related data can be converted to datetime objects, and numerical values stored as strings can be converted to numeric types. Note that the data lifecycle is a continuous cycle: what we perform in this section is just preliminary cleaning, and we will continue to clean as we discover more about the data in later sections.
We also take this opportunity to preprocess our data by creating some columns of our own based on existing data, to make later analysis easier. We do this by "binning", i.e., introducing a new categorical variable that can provide additional information.
# Clean for missing data, remove any rows with missing data
station = station.dropna()
trip = trip.dropna()
# Convert weather data to numeric values
weather = weather.apply(pd.to_numeric, errors='ignore')
weather['precipitation_inches'] = pd.to_numeric(weather['precipitation_inches'], errors='coerce')
# Convert dates from string to datetime objects
trip['start_date'] = pd.to_datetime(trip['start_date'])
trip['end_date'] = pd.to_datetime(trip['end_date'])
weather['date'] = pd.to_datetime(weather['date'])
trip["date"] = pd.to_datetime(trip["start_date"]).dt.date
# Add a column for start date without time
trip["date"] = trip['start_date'].apply(lambda x: x.date())
trip['date'] = pd.to_datetime(trip['date'])
# Binning by month using the start date
trip['month'] = trip['start_date'].apply(lambda x: x.month)
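To illustrate the binning idea a bit more generally, here is a small optional sketch (not used in the rest of the tutorial) that bins trips into time-of-day categories with pd.cut; the time_of_day categories and their labels are our own choice.
# Sketch: bin trips into time-of-day categories based on the hour each trip started
# (illustration only; this bin is not used below)
time_of_day = pd.cut(trip['start_date'].dt.hour,
                     bins=[0, 6, 12, 18, 24],
                     labels=['night', 'morning', 'afternoon', 'evening'],
                     right=False)
time_of_day.value_counts()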
In this part, our goal is to explore our data and get a better understanding of the different variables and some of their relationships. We focus on variables we are potentially interested in and determine whether there are any general trends, using visualizations and statistics.
To get a better understanding of the bike program, we can first start by mapping out the location of all stations.
# Create a map and center it
map_osm = folium.Map(location=[37.580000, -122.28000], zoom_start=10)
# Mark all the stations on map
for index, station_info in station.iterrows():
    tooltip_text = station_info['name'] + "\n"
    folium.Marker(location=[station_info["lat"], station_info["long"]], tooltip=tooltip_text).add_to(map_osm)
map_osm
From the map output, we can see that the bike share stations are not evenly distributed geographically. There appear to be 5 clusters centered at San Francisco, San Jose, Redwood City, Mountain View, and Palo Alto. Hovering over a marker shows the name of the corresponding station.
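As a quick sanity check on what the map shows, we can count how many stations fall in each city; the city column in the station table should mirror the five clusters.
# Number of stations in each city cluster
station['city'].value_counts()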
Next, we can also take a look at the trip data. We first output some summary stats to get a general idea what numerical data exists.
trip.describe()
id | duration | start_station_id | end_station_id | bike_id | month | |
---|---|---|---|---|---|---|
count | 669959.000000 | 6.699590e+05 | 669959.000000 | 669959.000000 | 669959.000000 | 669959.000000 |
mean | 460382.009899 | 1.107950e+03 | 57.851876 | 57.837438 | 427.587620 | 6.476844 |
std | 264584.458487 | 2.225544e+04 | 17.112474 | 17.200142 | 153.450988 | 3.276798 |
min | 4069.000000 | 6.000000e+01 | 2.000000 | 2.000000 | 9.000000 | 1.000000 |
25% | 231082.500000 | 3.440000e+02 | 50.000000 | 50.000000 | 334.000000 | 4.000000 |
50% | 459274.000000 | 5.170000e+02 | 62.000000 | 62.000000 | 440.000000 | 7.000000 |
75% | 692601.000000 | 7.550000e+02 | 70.000000 | 70.000000 | 546.000000 | 9.000000 |
max | 913460.000000 | 1.727040e+07 | 84.000000 | 84.000000 | 878.000000 | 12.000000 |
In this case, the summary statistics for the month and ID-related columns do not mean much, since these are really categorical variables that happen to be represented with numbers. However, duration is interesting: the mean is approximately 1108. With such a large value, the natural question is: what are the units? Minutes (or anything larger) would be unreasonably long for a typical bike trip, so we can temporarily assume the unit is seconds, convert to minutes, and see whether the results look reasonable.
trip['duration'] = trip['duration'].apply(lambda x: x / 60)
trip.describe()
id | duration | start_station_id | end_station_id | bike_id | month | |
---|---|---|---|---|---|---|
count | 669959.000000 | 669959.000000 | 669959.000000 | 669959.000000 | 669959.000000 | 669959.000000 |
mean | 460382.009899 | 18.465831 | 57.851876 | 57.837438 | 427.587620 | 6.476844 |
std | 264584.458487 | 370.923950 | 17.112474 | 17.200142 | 153.450988 | 3.276798 |
min | 4069.000000 | 1.000000 | 2.000000 | 2.000000 | 9.000000 | 1.000000 |
25% | 231082.500000 | 5.733333 | 50.000000 | 50.000000 | 334.000000 | 4.000000 |
50% | 459274.000000 | 8.616667 | 62.000000 | 62.000000 | 440.000000 | 7.000000 |
75% | 692601.000000 | 12.583333 | 70.000000 | 70.000000 | 546.000000 | 9.000000 |
max | 913460.000000 | 287840.000000 | 84.000000 | 84.000000 | 878.000000 | 12.000000 |
The new mean of the duration suggests our guess was correct and the original units were seconds, since a mean bike trip duration of around 18 minutes is reasonable. The max of 287,840 minutes implies there is at least one extreme outlier. With 75% of trips lasting about 12.5 minutes or less, yet a mean well above that, the data is clearly skewed. We can therefore plot the distribution of trips lasting at most 1 hour to better understand the shape of the duration distribution with less influence from the outliers.
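Before plotting, a quick one-line check (the exact value will depend on the data) tells us how much of the data lies beyond the one-hour cutoff we are about to use:
# Fraction of trips longer than one hour (duration is now in minutes)
print((trip['duration'] > 60).mean())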
# histogram of the biking time duration for trips of length 1 hour
# Label points and change plot size
fig, ax = plt.subplots(figsize = (9, 6))
plt.hist(trip['duration'], bins = 60, range=[0, 1 * 60])
# Graph labels and titles
plt.title('Distribution of duration of bike trips that are at most 1 hour')
plt.xlabel('Duration (minutes)')
plt.ylabel('Frequency')
plt.show()
As expected, the histogram reveals that trip duration is skewed, with a relatively long right tail.
The next thing we consider is the frequency of bike trips. Does the number of bike trips follow a seasonal pattern? To find out, we can use the month bins we created earlier to build a visualization that compares the number of bike trips per month.
# number of bike trips per month
# Label points and change plot size
fig, ax = plt.subplots(figsize = (9, 6))
plt.bar(range(1, 13), np.bincount(trip['month'], minlength=13)[1:], color='red')
# Graph labels and titles
plt.title('Bike trips per month')
plt.xlabel('Month')
plt.ylabel('Number of trips')
ax.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
plt.show()
We observe that December has the fewest bike trips and August has the most. There is a general trend where spring and summer months have more bike trips than fall and winter months.
Do the months also have any relationship with bike trip duration? We can check with a series of box plots.
Note that for the box plots below, the whiskers by default "extend no more than 1.5 * IQR (IQR = Q3 - Q1) from the edges of the box, ending at the farthest data point within that interval", according to the documentation on the boxplot function found here.
# Show distribution of bike duration per month via box plots
ax = trip[['month', 'duration']].boxplot(by='month', grid=False, showfliers=False, figsize=(9, 6), return_type='axes')
plt.title("Duration distirbution by month")
plt.suptitle('')
plt.xlabel('Month')
plt.ylabel('Duration (min)')
plt.show()
Plotting it out, there do not seem to be any interesting trends here. Biking duration looks about the same across all months: the median durations, indicated by the green line in each box, are approximately equal for every month.
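To back up this reading of the plot, we can also compute the per-month medians directly (a quick check; the exact values will vary slightly):
# Median trip duration (in minutes) for each month
trip.groupby('month')['duration'].median()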
Before moving on to explore another table, we can consider other trends not involving time. Who are the riders for the bike trip?
trip['subscription_type'].value_counts()
566746/(566746+103213)
0.8459413187971204
Most of the user base are subscribers, who make up about 84.59% of trips.
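The same proportion can also be computed without hard-coding the counts, using value_counts with normalize=True:
# Subscriber vs. Customer breakdown as proportions
trip['subscription_type'].value_counts(normalize=True)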
Next, we can explore the weather table a bit. One thing we do know from the dataset description is that we only have weather data for certain zipcodes. So what does that mean? We can start by printing out the possible values for zipcodes.
weather['zip_code'].value_counts()
94107    733
94063    733
94301    733
94041    733
95113    733
Name: zip_code, dtype: int64
Interestingly, there are only 5 zipcodes, and each appears exactly 733 times, i.e., one weather row per zipcode per day over the two-year span. Does this align with the zipcodes in the trip data?
trip['zip_code'].value_counts()
94107    78704
94105    42672
94133    31359
94103    26673
94111    21409
         ...
44145        1
60131        1
3300         1
8033         1
9069         1
Name: zip_code, Length: 7440, dtype: int64
We notice that there are far more unique zipcodes in the trip data, so we need to bin trips by the closest weather zipcode. We can approach this visually first by creating another map that highlights where the weather zipcode areas are. There are 5 weather zipcodes, and earlier we observed 5 station clusters, so let us graph each station cluster individually and determine whether these observations are related.
We start by importing a file that can visually represent zipcode regions on a map. This is a GeoJSON file obtained from the San Francisco government website, containing zipcode boundaries for the Bay Area. The source link can be found here.
# load GeoJSON with Bay Area zipcode boundaries
with open('Bay Area ZIP Codes.geojson', 'r') as jsonFile:
    zipcode_map_data = json.load(jsonFile)
After loading in the zipcode boundary data, we can filter it for the zipcodes we are interested in.
# create a list of zipcodes in weather
zip_list = ['94041', '95113', '94107', '94063', '94301']
# extract zip boundaries for the zipcodes we have
new_zip_map_data = []
for i in range(len(zipcode_map_data['features'])):
    if zipcode_map_data['features'][i]['properties']['zip'] in zip_list:
        new_zip_map_data.append(zipcode_map_data['features'][i])
# create new JSON
zipcode_map_data2 = dict.fromkeys(['type', 'features'])
zipcode_map_data2['type'] = 'FeatureCollection'
zipcode_map_data2['features'] = new_zip_map_data
# function to mark all the stations in a given city region on a map
def add_marker_for_region(region, city_map):
    for index, station_info in station.iterrows():
        if station_info['city'] == region:
            tooltip_text = station_info['name'] + "\n"
            folium.Marker(location=[station_info["lat"], station_info["long"]], tooltip=tooltip_text).add_to(city_map)
# Map San Francisco
map_osm = folium.Map(location=[37.7749, -122.4194], zoom_start=13)
add_marker_for_region('San Francisco', map_osm)
map_osm.choropleth(
    geo_data = zipcode_map_data2,
    fill_opacity = 0.5,
)
map_osm
# Map San Jose
map_osm = folium.Map(location=[37.3387, -121.8853], zoom_start=13)
add_marker_for_region('San Jose', map_osm)
map_osm.choropleth(
    geo_data = zipcode_map_data2,
    fill_opacity = 0.5,
)
map_osm
# Map Redwood City
map_osm = folium.Map(location=[37.4848, -122.2281], zoom_start=13)
add_marker_for_region('Redwood City', map_osm)
map_osm.choropleth(
    geo_data = zipcode_map_data2,
    fill_opacity = 0.5,
)
map_osm
# Map Mountain View
map_osm = folium.Map(location=[37.3861, -122.0839], zoom_start=13)
add_marker_for_region('Mountain View', map_osm)
map_osm.choropleth(
    geo_data = zipcode_map_data2,
    fill_opacity = 0.5,
)
map_osm
# Map Palo Alto
map_osm = folium.Map(location=[37.4419, -122.1430], zoom_start=13)
add_marker_for_region('Palo Alto', map_osm)
map_osm.choropleth(
    geo_data = zipcode_map_data2,
    fill_opacity = 0.5,
)
map_osm
The 5 maps of the different station clusters above support our theory that each weather zipcode corresponds to one cluster of stations. We can add this newly discovered information to our dataframes using the binning technique again.
We start by mapping the zipcodes to the corresponding city regions in a dictionary. A quick Google search lets us determine which zipcode corresponds to which city region:
# associate the zipcodes zip_list = ['94041', '95113', '94107', '94063', '94301'] to one of the 5 cities:
# San Francisco, San Jose, Redwood City, Mountain View, and Palo Alto
city_dict1 = {'94041':'Mountain View', '95113':'San Jose', '94107':'San Francisco', '94063':'Redwood City', '94301':'Palo Alto'}
city_dict2 = {'Mountain View':'94041', 'San Jose':'95113', 'San Francisco':'94107', 'Redwood City':'94063', 'Palo Alto':'94301'}
# we add the city mapping to a zipcode
station['zipcode'] = station['city'].apply(lambda x: city_dict2[x])
# we add zipcode to city mapping in weather
weather['city'] = weather['zip_code'].apply(lambda x: city_dict1[str(x)])
# we assign a bin for each trip based on start location station
station_tmp = station[['id', 'name', 'city', 'zipcode', 'dock_count']]
# we can use the bins as keys to merge all our data into one dataframe!
# merge station info into trip data
trip_tmp = pd.merge(left=trip,right=station_tmp, how = "inner", \
left_on = ['start_station_id', 'start_station_name'], right_on = ['id', 'name'])
trip_tmp.rename(columns={'id_x': 'trip_id', 'zipcode' : 'city_zip'}, inplace=True)
# remove duplicate column from merge key
trip_tmp = trip_tmp.drop('id_y', axis=1)
trip_tmp = trip_tmp.drop('name', axis=1)
# merge weather info into trip data
# convert weather zipcode to string to ensure data type alignment
weather['zip_code'] = weather['zip_code'].apply(lambda x: str(x))
trip_tmp = pd.merge(left=trip_tmp, right=weather, how = "inner", left_on=['date', 'city', 'city_zip'],\
right_on = ['date', 'city', 'zip_code'])
trip_tmp.rename(columns={'zip_code_x': 'trip_zip_code'}, inplace=True)
trip_tmp = trip_tmp.drop('zip_code_y', axis=1)
trip_all_data = trip_tmp.copy(deep=True)
Note that we performed the merge with an inner join, which means a row is only kept when the values of the merge columns match in both tables. We chose this join operation because it ensures we will not end up with rows of missing data. Other options include left joins, right joins, and so on.
More information on the inner join operation, as well as other join operations, can be found here.
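To make the difference concrete, here is a tiny toy example (made-up data, not from our dataset) contrasting an inner join with a left join:
# Toy dataframes to illustrate join behavior
left_df = pd.DataFrame({'key': ['a', 'b', 'c'], 'x': [1, 2, 3]})
right_df = pd.DataFrame({'key': ['a', 'b', 'd'], 'y': [10, 20, 40]})
# inner join: only keys present in both frames are kept ('c' and 'd' are dropped)
print(pd.merge(left_df, right_df, on='key', how='inner'))
# left join: every row of the left frame is kept; unmatched rows get NaN for 'y'
print(pd.merge(left_df, right_df, on='key', how='left'))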
After merging all the data, we can now consider relationships between variables from different tables. We first look at where users typically bike among the 5 station clusters.
# analyze where most people bike
trip_all_data['city'].value_counts().plot(kind="bar", title = "Number of bike trips in region", xlabel="city", ylabel="Trip count", color="red")
plt.show()
We observe that San Francisco is the area with the most bike trips, by a large margin.
# Get trips per day
trips_by_date = trip.groupby(["date"])
trips = trips_by_date.size()
From there, we also need the weather for each day. There is a bit of a snag, since the weather table is separated by zip code, with a separate row for each zip code on each day. We get around this by averaging across the zip codes for each day.
# Group data
weather_by_date = weather.groupby("date")
weather_by_date = weather_by_date.mean()
weather_by_date["trips"] = trips
# weather_by_date = weather_by_date.drop("zip_code", axis=1)
weather_by_date["day"] = weather_by_date.index
weather_by_date["day"] = pd.to_datetime(weather_by_date["day"])
weather_by_date["day"] = weather_by_date["day"].dt.dayofweek
weather_by_date.head()
max_temperature_f | mean_temperature_f | min_temperature_f | max_dew_point_f | mean_dew_point_f | min_dew_point_f | max_humidity | mean_humidity | min_humidity | max_sea_level_pressure_inches | ... | mean_visibility_miles | min_visibility_miles | max_wind_Speed_mph | mean_wind_speed_mph | max_gust_speed_mph | precipitation_inches | cloud_cover | wind_dir_degrees | trips | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | |||||||||||||||||||||
2013-08-29 | 78.6 | 70.4 | 62.8 | 62.6 | 60.0 | 58.0 | 88.8 | 73.2 | 53.0 | 30.064 | ... | 10.0 | 10.0 | 17.8 | 7.4 | 21.6 | 0.0 | 4.2 | 325.8 | 748 | 3 |
2013-08-30 | 84.6 | 73.0 | 62.4 | 63.4 | 59.8 | 55.8 | 90.8 | 68.8 | 42.4 | 30.054 | ... | 10.0 | 9.4 | 19.4 | 5.6 | 23.0 | 0.0 | 2.0 | 181.8 | 714 | 4 |
2013-08-31 | 76.4 | 68.0 | 59.8 | 59.6 | 57.2 | 54.4 | 88.2 | 70.8 | 51.6 | 29.994 | ... | 10.0 | 10.0 | 18.4 | 7.4 | 24.5 | 0.0 | 1.0 | 324.6 | 640 | 5 |
2013-09-01 | 79.2 | 70.0 | 61.0 | 61.2 | 57.2 | 53.6 | 83.4 | 66.8 | 44.4 | 29.962 | ... | 10.0 | 10.0 | 18.6 | 7.2 | 21.8 | 0.0 | 2.2 | 318.8 | 706 | 6 |
2013-09-02 | 77.4 | 70.8 | 64.6 | 62.8 | 60.8 | 58.4 | 85.8 | 74.0 | 58.8 | 29.972 | ... | 10.0 | 9.2 | 18.2 | 7.4 | 22.6 | 0.0 | 5.0 | 311.4 | 661 | 0 |
5 rows × 23 columns
We can also take a look and see whether any of the weather data correlates with the number of trips.
# Get correlation info
factors = ['trips', 'mean_temperature_f', 'mean_dew_point_f','mean_humidity', 'mean_sea_level_pressure_inches',
'mean_visibility_miles', 'mean_wind_speed_mph', 'precipitation_inches', 'cloud_cover',
'wind_dir_degrees']
cor = weather_by_date[factors].corr()
cor = round(cor, 2)
sns.heatmap(cor, annot=True)
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Plot correlation graphs
sns.lmplot(x="mean_temperature_f", y="trips", data=weather_by_date).set(title="Trips by Temperature (F) Total")
for i in range(7):
    sns.lmplot(x="mean_temperature_f", y="trips", data=weather_by_date.loc[(weather_by_date["day"] == i)]).set(title="Trips by Temperature (F) " + days[i])
sns.lmplot(x="precipitation_inches", y="trips", data=weather_by_date).set(title="Trips by Precipitation Inches")
sns.lmplot(x="mean_dew_point_f", y="trips", data=weather_by_date).set(title="Trips by Dew Point (F)")
<seaborn.axisgrid.FacetGrid at 0x7fad7c5590d0>
From the heatmap of the correlation matrix, we can see that temperature, precipitation, and dew point correlate the most with the number of trips, with temperature being the strongest. The scatterplots also show that the strength and slope of this relationship vary by day of the week. The precipitation and dew point plots are distinctive: most precipitation values are 0, and both the combined temperature plot and the dew point plot are largely divided into two clusters.
For more information on the difference by day, we can make a violin plot of how many trips occur on each day of the week.
# Violin plots by day
sns.violinplot(data=weather_by_date, x="day", y="trips").set(title="Trips by Day of Week")
[Text(0.5, 1.0, 'Trips by Day of Week')]
We can see that fewer trips occur on the weekends, while the weekdays remain relatively similar to each other. This might explain the odd shape of the temperature and dew point plots. Let's plot this to check.
sns.lmplot(x="mean_temperature_f", y="trips", data=weather_by_date, hue="day").set(title="Trips by Dew Point (F)")
plt.show()
As we can see, this does indeed appear to be the reason there are two clusters. Nearly all of the weekend days stay near the bottom and are also less influenced by temperature.
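We can also quantify this split with a quick groupby (a small check; exact numbers depend on the data), comparing average daily trips on weekdays versus weekends:
# Average number of daily trips on weekdays (False) vs. weekends (True)
weather_by_date.groupby(weather_by_date['day'] >= 5)['trips'].mean()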
From here, we can merge the trip and weather data to provide the weather info for each separate trip.
# merge weather info into each trip by matching on date and zip code
mldf = pd.merge(trip, weather, left_on=['date','zip_code'], right_on = ['date','zip_code'])
mldf["day"] = pd.to_datetime(mldf["start_date"]).dt.dayofweek
mldf = mldf.drop("max_gust_speed_mph", axis=1)
mldf = mldf.dropna()
mldf.head()
id | duration | start_date | start_station_name | start_station_id | end_date | end_station_name | end_station_id | bike_id | subscription_type | ... | mean_visibility_miles | min_visibility_miles | max_wind_Speed_mph | mean_wind_speed_mph | precipitation_inches | cloud_cover | events | wind_dir_degrees | city | day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4275 | 14.716667 | 2013-08-29 11:43:00 | Mechanics Plaza (Market at Battery) | 75 | 2013-08-29 11:57:00 | South Van Ness at Market | 66 | 626 | Subscriber | ... | 10.0 | 10.0 | 23.0 | 11.0 | 0.0 | 4.0 | 286.0 | San Francisco | 3 | |
1 | 4290 | 5.150000 | 2013-08-29 11:58:00 | Powell Street BART | 39 | 2013-08-29 12:03:00 | Market at 10th | 67 | 496 | Customer | ... | 10.0 | 10.0 | 23.0 | 11.0 | 0.0 | 4.0 | 286.0 | San Francisco | 3 | |
2 | 4319 | 14.983333 | 2013-08-29 12:10:00 | South Van Ness at Market | 66 | 2013-08-29 12:25:00 | Beale at Market | 56 | 546 | Subscriber | ... | 10.0 | 10.0 | 23.0 | 11.0 | 0.0 | 4.0 | 286.0 | San Francisco | 3 | |
3 | 4442 | 7.900000 | 2013-08-29 12:56:00 | Washington at Kearney | 46 | 2013-08-29 13:04:00 | Market at Sansome | 77 | 607 | Subscriber | ... | 10.0 | 10.0 | 23.0 | 11.0 | 0.0 | 4.0 | 286.0 | San Francisco | 3 | |
4 | 4454 | 5.666667 | 2013-08-29 13:05:00 | Market at Sansome | 77 | 2013-08-29 13:10:00 | Washington at Kearney | 46 | 590 | Subscriber | ... | 10.0 | 10.0 | 23.0 | 11.0 | 0.0 | 4.0 | 286.0 | San Francisco | 3 |
5 rows × 36 columns
We can create a correlation matrix to see which variables tend to occur together. This gives us visual insight into what might be worth looking into further.
# get only numeric columns
numeric_mldf = mldf.select_dtypes(include=np.number)
plt.figure(figsize=(15, 12))
cor = numeric_mldf.corr()
cor = round(cor, 2)
sns.heatmap(cor, annot=True)
<AxesSubplot:>
As you can see, some of the weather patterns tend to correlate, but many of the values we're interested in, like duration, have almost no correlation with weather.
We saw earlier that there seems to be some relationship between mean temperature and the number of trips. We can perform a test to determine whether the two variables are correlated.
The first step before selecting a hypothesis test is to determine whether the distributions of our variables are normal.
weather_by_date['mean_temperature_f'].hist()
<AxesSubplot:>
weather_by_date['trips'].hist()
<AxesSubplot:>
From the two histograms above, it is clear that the number of trips is not normal and appears to be bimodal. We also noticed earlier that the number of trips varies greatly depending on whether it is a weekday or a weekend, which may explain why the distribution has this shape. Since the distribution is not normal, we need a non-parametric test. One such test is the Spearman rank correlation coefficient test, which measures the significance of the association between two variables without requiring either to be normal. Specifically, it measures the strength and direction of a monotonic relationship: a relationship where, as one variable increases, the other consistently increases (or consistently decreases).
Using this test, we can then state our null and alternative hypothesis. Our null hypothesis is that there is no monotonic relationship between mean temperature and number of trips. The alternative hypothesis is that there is a monotonic relationship between mean temperature and number of trips.
From this test, we can obtain a p-value, which informs us about the significance of the correlation between the 2 variables.
More information on the Spearman rank correlation coefficient test can be found here.
x = np.array(weather_by_date['mean_temperature_f'])
y = np.array(weather_by_date['trips'])
# perform test
stat, p = st.spearmanr(x, y)
print("p-value: " + str(p))
p-value: 5.864017930805387e-17
From the p-value above, since its value is less than 0.05, we can say that it is statistically significant and that we are able to reject the null hypothesis that there is no correlation between the two variables.
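Beyond the p-value, it is also worth reporting the strength of the association; the stat value returned by spearmanr above is the Spearman correlation coefficient itself:
# The test statistic is the Spearman correlation coefficient (rho), which describes
# the strength and direction of the monotonic relationship
print("Spearman rho: " + str(stat))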
Here, we want to predict the number of riders per day based on weather information. We drop max gust speed because that column has the most null values. From the heatmap of the correlation matrix in the Data Exploration and Analysis section, we saw that some variables, most notably temperature, correlate with the number of trips.
We use Linear Regression because it is relatively popular and well documented. Other options would be Ridge regression or ElasticNet, which we did not choose because they are less well documented.
# get x and y values
x = weather_by_date.drop(["trips", 'max_gust_speed_mph'],axis=1)
y = weather_by_date["trips"]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = r2_score(y_test,y_pred)
print('R2 Score: ',score)
print('Mean Squared Error: ',mean_squared_error(y_test,y_pred))
R2 Score:  0.5321185386801174
Mean Squared Error:  86522.42552396748
As you can see, this model performs poorly. This appears to be in part due to the size of the dataset, which has just over 700 rows, one for each day. The limited dataset makes this model less viable.
We can use cross validation to get a better idea of the accuracy.
# Cross validation
scores = cross_val_score(model, x, y, cv=10)
# report performance
print("%0.2f accuracy with a standard deviation (standard error) of %0.2f" % (scores.mean(), scores.std()))
0.39 accuracy with a standard deviation (standard error) of 0.28
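Before setting this question aside, one of the alternatives mentioned earlier, Ridge regression, could be swapped in with very little extra code. The sketch below is untuned (the alpha value is our own default choice); with only ~700 rows of weather-only features, we would not expect a dramatic improvement.
from sklearn.linear_model import Ridge
# Ridge regression: linear regression with an L2 penalty on the coefficients,
# which can help when the weather features are correlated with each other
ridge = Ridge(alpha=1.0)
ridge_scores = cross_val_score(ridge, x, y, cv=10)
print("Mean R^2 across folds: %0.2f" % ridge_scores.mean())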
Due to the failure of this model, it was not feasible to try to use weather patterns to guess at how many customers would use the ride share service. Because of this, we decided to pivot to a different question that could be answered with the data.
In this section, our goal is to use machine learning to help us make predictions. We chose to tackle a classification problem: using the ride data to identify whether a rider is a subscriber or not.
For the model, we started by considering a linear SVM classifier, which is well suited to large datasets. Since the trip dataset has more than 600k rows, it works well with this algorithm.
Scikit-learn's SGDClassifier implements this classifier as well as other linear classifiers, and we are able to test between these to find the most effective.
Another option would be LinearSVC, which has a different implementation; however, the SGD-based approach scales better to larger datasets.
# Set up input values x and y
x = numeric_mldf
y = mldf["subscription_type"]
# Divide into test and train sets
X_train, X_test, y_train, y_test=train_test_split(x, y, test_size=0.15)
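One thing worth noting: StandardScaler was imported at the top but is not used below, and SGD-based linear models are sensitive to feature scale. The runs below use the raw features, but a scaled variant could look like the following sketch (an optional refinement, not what produced the outputs shown):
from sklearn.pipeline import make_pipeline
# Optional: standardize features before SGD training; gradient-based linear models
# tend to converge more reliably when features are on comparable scales
scaled_clf = make_pipeline(StandardScaler(),
                           SGDClassifier(loss="hinge", penalty="l2", max_iter=1000))
scaled_clf.fit(X_train, y_train)
print(scaled_clf.score(X_test, y_test))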
This manually goes through different options to find the best ones, which allows us to narrow down the hyperparameters we give to the grid search below, for better efficiency.
# Used to narrow down the options for hyperparameter optimization (below) for time purposes
# Iteration count
n_iters = [5, 10, 20, 50, 100]
scores = []
for n_iter in n_iters:
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=n_iter)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
print(scores)
print("Highest score: " + str(max(scores)) + " Iter: " + str(n_iters[scores.index(max(scores))]))
# Loss function
losses = ["hinge", "log", "modified_huber", "perceptron", "squared_hinge"]
scores = []
for loss in losses:
    clf = SGDClassifier(loss=loss, penalty="l2", max_iter=1000)
    clf.fit(X_train, y_train)
    scores.append(clf.score(X_test, y_test))
print(scores)
print("Highest score: " + str(max(scores)) + " Function: " + str(losses[scores.index(max(scores))]))
# Alpha value
alphas = [0.0001, 0.001, 0.01, 0.1]
scores = []
for alpha in alphas:
clf = SGDClassifier(loss="hinge", penalty="l2", alpha=alpha)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
print(scores)
print("Highest score: " + str(max(scores)) + " Alpha: " + str(alphas[scores.index(max(scores))]))
# Penalty
penalties = ["l2", "l1", "elasticnet", "none"]
scores = []
for penalty in penalties:
clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=alpha)
clf.fit(X_train, y_train)
scores.append(clf.score(X_test, y_test))
print(scores)
print("Highest score: " + str(max(scores)) + " Penalty: " + str(penalties[scores.index(max(scores))]))
ConvergenceWarning: Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit. (this warning was emitted for several of the fits above)
[0.9610909931427691, 0.9610909931427691, 0.9609368980661068, 0.9610909931427691, 0.9591648046844903]
Highest score: 0.9610909931427691 Iter: 5
[0.9610909931427691, 0.9579320440711919, 0.9598582325294707, 0.9611680406811003, 0.9618614685260806]
Highest score: 0.9618614685260806 Function: squared_hinge
[0.038909006857230914, 0.9542337622312967, 0.9613991832960936, 0.9611680406811003]
Highest score: 0.9613991832960936 Alpha: 0.01
[0.9499961476230835, 0.960551660374451, 0.9609368980661068, 0.9305031204253024]
Highest score: 0.9609368980661068 Penalty: elasticnet
The results vary a bit from run to run; one run, for example, gave max_iter: 10, loss: perceptron, alpha: 0.01, and penalty: l2. Looking at all of the scores helps us choose the best-performing options for the grid search.
We can run a grid search over common parameters for the SGD Classifier. This gives us the optimal parameters, which we use to create a model below.
# Hyperparemeter optimization
params = {
"loss" : ["hinge", "modified_huber", "perceptron"],
"alpha" : [0.01, 0.1],
"penalty" : ["l2", "l1", "none"],
}
clf = SGDClassifier(max_iter=1000)
grid = GridSearchCV(clf, param_grid=params, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
{'alpha': 0.01, 'loss': 'modified_huber', 'penalty': 'l2'}
The result of the grid search can vary slightly between runs; the run we used gave {'alpha': 0.1, 'loss': 'modified_huber', 'penalty': 'l2'}, and these values are used for the SGD Classifier below.
clf = SGDClassifier(loss="modified_huber", penalty="l2", alpha=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy: {:.4f}\n'.format(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
Accuracy: 0.9619

              precision    recall  f1-score   support

    Customer       0.60      0.06      0.11       505
  Subscriber       0.96      1.00      0.98     12474

    accuracy                           0.96     12979
   macro avg       0.78      0.53      0.55     12979
weighted avg       0.95      0.96      0.95     12979
The overall accuracy of the model was around 96%. The accuracy was much higher for identifying subscribers than customers, likely because there are significantly more subscribers, as mentioned above in Data Exploration.
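One common way to compensate for this imbalance (a sketch only; it was not applied to the results above or to the cross-validation below) is to weight the classes inversely to their frequency with class_weight='balanced':
# Re-weight classes so the rarer "Customer" class has more influence during training
balanced_clf = SGDClassifier(loss="modified_huber", penalty="l2", alpha=0.1,
                             class_weight="balanced")
balanced_clf.fit(X_train, y_train)
print(classification_report(y_test, balanced_clf.predict(X_test)))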
# Cross validation
scores = cross_val_score(clf, x, y, cv=10)
# report performance
print("%0.2f accuracy with a standard deviation (standard error) of %0.2f" % (scores.mean(), scores.std()))
0.88 accuracy with a standard deviation (standard error) of 0.24
Using 10-fold cross validation, the model performs with a mean accuracy of 0.88 and a standard deviation of 0.24 across folds.
In summary, we started this tutorial by importing our dataset and representing each of its tables as a dataframe in our notebook. We then cleaned each dataframe. With the large number of data attributes, we had many relationships to explore between variables in the station, trip, and weather tables. Our exploration techniques included summary statistics as well as visualizations to better understand trends in the data. By creating bins, we were able to make additional observations and merge all the tables into one dataframe. Based on what we observed during exploration and analysis, we performed hypothesis testing to check whether what we observed was statistically significant. We specifically looked at the relationship between mean temperature and the number of trips, and found, using the Spearman rank correlation coefficient test, that these variables are indeed correlated. With this information, we built a machine learning regression model to predict the number of users per day from weather information such as temperature. However, the model was not accurate. This was likely because the dataset spans only 2 years, limiting the number of days and therefore the amount of data to train on. Because of this, we pivoted to a classification model that predicts the type of user based on information about their ride, which can be useful for seeing trends among subscribed bike share users. This model, trained on the more than 600k rows in the trip dataset, proved to be far more accurate.
Overall, we were able to make observations about trends that may be useful for future business decisions for the San Francisco Bay Area bike share program. Due to the number of variables in the data, we likely could not analyze every possible relationship, and we did encounter limitations from the time span of our data. As an extension of this work, if we obtained more data spanning a longer time period, we could revisit the initial question we hoped to answer with machine learning. We could also analyze growth trends and see which factors correlate with growth, which would be helpful to new bike share programs. In addition, we could broaden the dataset to include bike share programs from across the globe to see whether the trends we found also hold globally.