#Let's import some modules:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import os
#Setting all graphs to be a specific color and size
sns.set_palette('Set3')
plt.rcParams['figure.figsize']=10,6
#Read in the 13 CSV files:
folder = '../f1db_csv/'
filelist = [file for file in os.listdir(folder) if file.endswith('.csv')]
for file in filelist:
exec("%s = pd.read_csv('%s')" % (file.split('.')[0], os.path.join(folder,file)))
#Check the dataframes were created correctly:
filelist
['circuits.csv', 'status.csv', 'lap_times.csv', 'drivers.csv', 'races.csv', 'constructors.csv', 'constructor_standings.csv', 'qualifying.csv', 'driver_standings.csv', 'constructor_results.csv', 'pit_stops.csv', 'seasons.csv', 'results.csv']
#Merging into 1 dataframe:
f1 = results.copy()
f1 = f1.merge(constructors, how='left', on='constructorId')
f1 = f1.merge(races, how='left', on='raceId')
f1 = f1.merge(circuits, how='left', on='circuitId')
f1 = f1.merge(drivers, how='left', on='driverId')
f1 = f1.merge(status, how='left', on='statusId')
f1.head()
resultId | raceId | driverId | constructorId | number_x | grid | position | positionText | positionOrder | points | laps | time_x | milliseconds | fastestLap | rank | fastestLapTime | fastestLapSpeed | statusId | constructorRef | name_x | nationality_x | url_x | year | round | circuitId | name_y | date | time_y | url_y | circuitRef | name | location | country | lat | lng | alt | url_x | driverRef | number_y | code | forename | surname | dob | nationality_y | url_y | status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 18 | 1 | 1 | 22 | 1 | 1 | 1 | 1 | 10.0 | 58 | 1:34:50.616 | 5690616 | 39 | 2 | 1:27.452 | 218.300 | 1 | mclaren | McLaren | British | http://en.wikipedia.org/wiki/McLaren | 2008 | 1 | 1 | Australian Grand Prix | 2008-03-16 | 04:30:00 | http://en.wikipedia.org/wiki/2008_Australian_G... | albert_park | Albert Park Grand Prix Circuit | Melbourne | Australia | -37.8497 | 144.968 | 10 | http://en.wikipedia.org/wiki/Melbourne_Grand_P... | hamilton | 44 | HAM | Lewis | Hamilton | 1985-01-07 | British | http://en.wikipedia.org/wiki/Lewis_Hamilton | Finished |
1 | 2 | 18 | 2 | 2 | 3 | 5 | 2 | 2 | 2 | 8.0 | 58 | +5.478 | 5696094 | 41 | 3 | 1:27.739 | 217.586 | 1 | bmw_sauber | BMW Sauber | German | http://en.wikipedia.org/wiki/BMW_Sauber | 2008 | 1 | 1 | Australian Grand Prix | 2008-03-16 | 04:30:00 | http://en.wikipedia.org/wiki/2008_Australian_G... | albert_park | Albert Park Grand Prix Circuit | Melbourne | Australia | -37.8497 | 144.968 | 10 | http://en.wikipedia.org/wiki/Melbourne_Grand_P... | heidfeld | \N | HEI | Nick | Heidfeld | 1977-05-10 | German | http://en.wikipedia.org/wiki/Nick_Heidfeld | Finished |
2 | 3 | 18 | 3 | 3 | 7 | 7 | 3 | 3 | 3 | 6.0 | 58 | +8.163 | 5698779 | 41 | 5 | 1:28.090 | 216.719 | 1 | williams | Williams | British | http://en.wikipedia.org/wiki/Williams_Grand_Pr... | 2008 | 1 | 1 | Australian Grand Prix | 2008-03-16 | 04:30:00 | http://en.wikipedia.org/wiki/2008_Australian_G... | albert_park | Albert Park Grand Prix Circuit | Melbourne | Australia | -37.8497 | 144.968 | 10 | http://en.wikipedia.org/wiki/Melbourne_Grand_P... | rosberg | 6 | ROS | Nico | Rosberg | 1985-06-27 | German | http://en.wikipedia.org/wiki/Nico_Rosberg | Finished |
3 | 4 | 18 | 4 | 4 | 5 | 11 | 4 | 4 | 4 | 5.0 | 58 | +17.181 | 5707797 | 58 | 7 | 1:28.603 | 215.464 | 1 | renault | Renault | French | http://en.wikipedia.org/wiki/Renault_in_Formul... | 2008 | 1 | 1 | Australian Grand Prix | 2008-03-16 | 04:30:00 | http://en.wikipedia.org/wiki/2008_Australian_G... | albert_park | Albert Park Grand Prix Circuit | Melbourne | Australia | -37.8497 | 144.968 | 10 | http://en.wikipedia.org/wiki/Melbourne_Grand_P... | alonso | 14 | ALO | Fernando | Alonso | 1981-07-29 | Spanish | http://en.wikipedia.org/wiki/Fernando_Alonso | Finished |
4 | 5 | 18 | 5 | 1 | 23 | 3 | 5 | 5 | 5 | 4.0 | 58 | +18.014 | 5708630 | 43 | 1 | 1:27.418 | 218.385 | 1 | mclaren | McLaren | British | http://en.wikipedia.org/wiki/McLaren | 2008 | 1 | 1 | Australian Grand Prix | 2008-03-16 | 04:30:00 | http://en.wikipedia.org/wiki/2008_Australian_G... | albert_park | Albert Park Grand Prix Circuit | Melbourne | Australia | -37.8497 | 144.968 | 10 | http://en.wikipedia.org/wiki/Melbourne_Grand_P... | kovalainen | \N | KOV | Heikki | Kovalainen | 1981-10-19 | Finnish | http://en.wikipedia.org/wiki/Heikki_Kovalainen | Finished |
f1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 24880 entries, 0 to 24879 Data columns (total 46 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 resultId 24880 non-null int64 1 raceId 24880 non-null int64 2 driverId 24880 non-null int64 3 constructorId 24880 non-null int64 4 number_x 24880 non-null object 5 grid 24880 non-null int64 6 position 24880 non-null object 7 positionText 24880 non-null object 8 positionOrder 24880 non-null int64 9 points 24880 non-null float64 10 laps 24880 non-null int64 11 time_x 24880 non-null object 12 milliseconds 24880 non-null object 13 fastestLap 24880 non-null object 14 rank 24880 non-null object 15 fastestLapTime 24880 non-null object 16 fastestLapSpeed 24880 non-null object 17 statusId 24880 non-null int64 18 constructorRef 24880 non-null object 19 name_x 24880 non-null object 20 nationality_x 24880 non-null object 21 url_x 24880 non-null object 22 year 24880 non-null int64 23 round 24880 non-null int64 24 circuitId 24880 non-null int64 25 name_y 24880 non-null object 26 date 24880 non-null object 27 time_y 24880 non-null object 28 url_y 24880 non-null object 29 circuitRef 24880 non-null object 30 name 24880 non-null object 31 location 24880 non-null object 32 country 24880 non-null object 33 lat 24880 non-null float64 34 lng 24880 non-null float64 35 alt 24880 non-null object 36 url_x 24880 non-null object 37 driverRef 24880 non-null object 38 number_y 24880 non-null object 39 code 24880 non-null object 40 forename 24880 non-null object 41 surname 24880 non-null object 42 dob 24880 non-null object 43 nationality_y 24880 non-null object 44 url_y 24879 non-null object 45 status 24880 non-null object dtypes: float64(3), int64(11), object(32) memory usage: 8.9+ MB
#replacing any empty vales with NaN
f1.replace(r'\N', np.nan, inplace=True)
#Focusing the dataframe:
f1 = f1[['year', 'name', 'date', 'driverRef', 'name_x', 'grid',
'position', 'points', 'laps', 'time_x',
'milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
'fastestLapSpeed', 'round',
'dob', 'nationality_y', 'status']]
f1.position = f1.position.astype(float)
f1 = f1.set_index('date')
f1 = f1.rename(columns={'name_x':'constructor', 'time_x': 'time', 'nationality_y':'nationality'})
f1 = f1.sort_values('year')
f1.head()
year | name | driverRef | constructor | grid | position | points | laps | time | milliseconds | fastestLap | rank | fastestLapTime | fastestLapSpeed | round | dob | nationality | status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
date | ||||||||||||||||||
1950-05-13 | 1950 | Silverstone Circuit | claes | Talbot-Lago | 21 | 11.0 | 0.0 | 64 | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 1916-08-11 | Belgian | +6 Laps |
1950-06-18 | 1950 | Circuit de Spa-Francorchamps | farina | Alfa Romeo | 1 | 4.0 | 4.0 | 35 | +4:05.0 | 10291000 | NaN | NaN | NaN | NaN | 5 | 1906-10-30 | Italian | Finished |
1950-06-18 | 1950 | Circuit de Spa-Francorchamps | ascari | Ferrari | 7 | 5.0 | 2.0 | 34 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 1918-07-13 | Italian | +1 Lap |
1950-06-18 | 1950 | Circuit de Spa-Francorchamps | villoresi | Ferrari | 4 | 6.0 | 0.0 | 33 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 1909-05-16 | Italian | +2 Laps |
1950-06-18 | 1950 | Circuit de Spa-Francorchamps | levegh | Talbot-Lago | 10 | 7.0 | 0.0 | 33 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 1905-12-22 | French | +2 Laps |
f1.info()
<class 'pandas.core.frame.DataFrame'> Index: 24880 entries, 1950-05-13 to 2020-11-01 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 year 24880 non-null int64 1 name 24880 non-null object 2 driverRef 24880 non-null object 3 constructor 24880 non-null object 4 grid 24880 non-null int64 5 position 14154 non-null float64 6 points 24880 non-null float64 7 laps 24880 non-null int64 8 time 6546 non-null object 9 milliseconds 6545 non-null object 10 fastestLap 6474 non-null object 11 rank 6631 non-null object 12 fastestLapTime 6474 non-null object 13 fastestLapSpeed 6474 non-null object 14 round 24880 non-null int64 15 dob 24880 non-null object 16 nationality 24880 non-null object 17 status 24880 non-null object dtypes: float64(2), int64(4), object(12) memory usage: 3.6+ MB
most_races = f1.driverRef.value_counts().head().copy()
most_races = pd.DataFrame(most_races)
most_races = most_races.reset_index()
most_races.columns = ['driver', 'number_of_races']
most_races
driver | number_of_races | |
---|---|---|
0 | raikkonen | 328 |
1 | barrichello | 326 |
2 | alonso | 314 |
3 | button | 309 |
4 | michael_schumacher | 308 |
sns.barplot(x='driver', y='number_of_races', data=most_races)
plt.title('Who has the most races in F1?', fontsize=20)
plt.ylabel('Number of Races', fontsize=15)
plt.xlabel('Driver', fontsize=15)
plt.xticks(np.arange(5), most_races.driver.str.capitalize(), fontsize=11);
most_years = f1[['year', 'driverRef']].copy()
most_years = most_years.drop_duplicates(['year', 'driverRef'])[['year', 'driverRef']]
most_years = pd.DataFrame(most_years)
most_years = most_years.value_counts('driverRef').reset_index().head(5)
most_years.columns = ['driver', 'number_of_years']
most_years.style.hide_index()
driver | number_of_years |
---|---|
barrichello | 19 |
michael_schumacher | 19 |
hill | 18 |
button | 18 |
raikkonen | 18 |
sns.barplot(x='driver', y='number_of_years', data=most_years)
plt.title('Who raced for the longest in F1?', fontsize=20)
plt.ylabel('Number of Years Racing', fontsize=15)
plt.xlabel('Driver', fontsize=15)
plt.xticks(np.arange(5), most_races.driver.str.capitalize(), fontsize=11)
;
''
track_use = f1[['year', 'name']].copy()
track_use = track_use.drop_duplicates(['year', 'name'])[['year', 'name']]
track_use = pd.DataFrame(track_use)
track_use = track_use.value_counts('name').reset_index()
track_use.columns = ['circuit', 'years_used']
most_track_use = track_use.copy().head()
most_track_use.style.hide_index()
circuit | years_used |
---|---|
Autodromo Nazionale di Monza | 70 |
Circuit de Monaco | 66 |
Silverstone Circuit | 54 |
Circuit de Spa-Francorchamps | 53 |
Nürburgring | 41 |
sns.barplot(x='circuit', y='years_used', data=most_track_use )
plt.title('Circuit used for the most years in F1', fontsize=20)
plt.ylabel('Number of Years Used', fontsize=15)
plt.xlabel('Circuit Name', labelpad=12, fontsize=15)
plt.xticks(np.arange(5), most_races.driver.str.capitalize(), rotation=15, size=11);
a = track_use[track_use['years_used']==1]
least = f1.copy()
least.columns
Index(['year', 'name', 'driverRef', 'constructor', 'grid', 'position', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'round', 'dob', 'nationality', 'status'], dtype='object')
least = a.merge(least, how='left', left_on='circuit', right_on='name')
least = least[['year','circuit', 'years_used']].drop_duplicates().sort_values('year')
least.style.hide_index()
year | circuit | years_used |
---|---|---|
1957 | Pescara Circuit | 1 |
1958 | Ain Diab | 1 |
1959 | Sebring International Raceway | 1 |
1959 | AVUS | 1 |
1959 | Monsanto Park Circuit | 1 |
1960 | Riverside International Raceway | 1 |
1964 | Zeltweg | 1 |
1967 | Le Mans | 1 |
1984 | Fair Park | 1 |
1993 | Donington Park | 1 |
2020 | Autodromo Internazionale del Mugello | 1 |
2020 | Autódromo Internacional do Algarve | 1 |
fastest_lap = f1[['name', 'fastestLapTime', 'driverRef']].copy()
fastest_lap = fastest_lap.sort_values(by='fastestLapTime')
fastest_lap = fastest_lap.drop_duplicates(['name'])[['name', 'fastestLapTime', 'driverRef']].head(5)
fastest_lap
name | fastestLapTime | driverRef | |
---|---|---|---|
date | |||
2020-07-12 | Red Bull Ring | 1:05.619 | sainz |
2004-06-20 | Indianapolis Motor Speedway | 1:10.399 | barrichello |
2018-11-11 | Autódromo José Carlos Pace | 1:10.540 | bottas |
2019-06-09 | Circuit Gilles Villeneuve | 1:13.078 | bottas |
2004-07-25 | Hockenheimring | 1:13.780 | raikkonen |
fastest_lap['fastestLapTimeSec'] = (fastest_lap['fastestLapTime'].str.split(':', expand = True)[0].astype('float64') * 60) + fastest_lap['fastestLapTime'].str.split(':', expand = True)[1].astype('float64')
fastest_lap
name | fastestLapTime | driverRef | fastestLapTimeSec | |
---|---|---|---|---|
date | ||||
2020-07-12 | Red Bull Ring | 1:05.619 | sainz | 65.619 |
2004-06-20 | Indianapolis Motor Speedway | 1:10.399 | barrichello | 70.399 |
2018-11-11 | Autódromo José Carlos Pace | 1:10.540 | bottas | 70.540 |
2019-06-09 | Circuit Gilles Villeneuve | 1:13.078 | bottas | 73.078 |
2004-07-25 | Hockenheimring | 1:13.780 | raikkonen | 73.780 |
sns.barplot(x='name', y='fastestLapTimeSec', data=fastest_lap)
plt.title('Which tracks have the fastest lap times in F1?', fontsize=20)
plt.ylabel('Lap Times (Sec)', fontsize=15)
plt.xlabel('Circuit', fontsize=15)
plt.xticks(rotation=15, fontsize=11);