Exploring Fire Incident Data¶
In [1]:
# Core libraries for loading and handling the tabular data
import pandas as pd
import os
# NOTE(review): the pandas_profiling package appears to have been renamed ydata-profiling upstream — confirm which one this environment provides
from pandas_profiling import ProfileReport
# Imported explicitly so it can be reloaded in a later cell; presumably also the engine pandas uses for .xlsx files — confirm
import openpyxl
import numpy as np
# Reaching this line only shows that the imports above succeeded
print("packages are installed successfully!")
In [ ]:
## `%matplotlib inline` is an IPython 'magic' command: it renders charts directly beneath the cell that creates them
%matplotlib inline
In [71]:
### OPTIONAL RELOAD: re-executes the already-imported modules so the packages are loaded when we need them
import importlib
importlib.reload(openpyxl)
# NOTE(review): NumPy discourages reloading and may emit a warning here — confirm this reload is still needed
# The value returned by this last call is what the cell displays as its output
importlib.reload(np)
Out[71]:
In [2]:
## Plotting stack used for the charts in this notebook
import matplotlib.pyplot as plt
import seaborn as sns

# Report the installed versions so the plotting environment is visible in the output
print("Seaborn version:", sns.__version__)
print("Matplotlib version:", plt.matplotlib.__version__)
In [ ]:
### Load the spreadsheet
### Resolve the data folder relative to the directory the notebook is running from
current_dir = os.getcwd()
folder_name = "Data"
data_dir = os.path.join(current_dir, folder_name)
file_name = "Naperville_2021_Fire_Incident_Calls.xlsx"
file_path = os.path.join(data_dir, file_name)

### Create the data folder on first run so there is a known place to drop the workbook
if not os.path.exists(data_dir):
    # exist_ok guards against the folder appearing between the check and the call
    os.makedirs(data_dir, exist_ok=True)
    print("Folder Created")
else:
    print("Folder already exists")

### Fail early with an actionable message when the workbook is absent
### (this is always the case right after the folder has just been created)
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Workbook not found: {file_path}. "
        f"Copy '{file_name}' into '{data_dir}' and re-run this cell."
    )

### Read in the spreadsheet and preview the first rows instead of dumping the whole table
input_data = pd.read_excel(file_path)
input_data.head()
Out[ ]:
In [ ]:
### Looks like X and Y hold the coordinates; the remaining fields cover address data, incident times and personnel counts
In [32]:
### Get a sense of the data shape: a (row count, column count) tuple
input_data.shape
Out[32]:
In [ ]:
### 5601 rows, 77 columns
In [ ]:
### Build a True/False table that flags every missing cell in the data
missing_values = input_data.isna()
print("Missing Values:", missing_values)
In [ ]:
### That True/False table is hard to read on its own — we need to sum the flags to get missing-value counts
In [ ]:
### Count the missing cells in each column and rank the columns from most to least incomplete
missing_values = (
    input_data
    .isna()
    .sum()
    .sort_values(ascending=False)
)
## Show only the 40 columns with the most missing cells
print(missing_values.iloc[:40])
In [ ]:
### Some fields are missing most of the data e.g., Hazardous Materials Code
In [ ]:
### Collapse the per-column counts into one grand total of missing cells for the whole table
missing_values = (
    input_data
    .isna()
    .sum()  # missing cells per column
    .sum()  # total across all columns
)
print(missing_values)
In [ ]:
### Summarize the table: field names, non-null counts (to spot missing values) and data types
input_data.info()
In [ ]:
### We do have some columns with missing values, e.g. incident street suffix code, where every value is missing (i.e., 0 non-null)
In [ ]:
## Percentage of incidents that fall under each incident type description
input_data["Incident Type Description"].value_counts(normalize=True).mul(100)
Out[ ]:
In [117]:
## Summary statistics for the incident type description column
input_data["Incident Type Description"].describe()
Out[117]:
In [ ]:
### We can summarize any variable but it would take a long time to do it thoroughly for all of them
Pandas Profiling¶
- A low-code library that lets us perform automated exploratory data analysis
- With a single line of code we produce an entire report of useful statistical and visual analyses
- The output is a well-organized HTML report that we can view directly in the notebook or share
In [8]:
### Build the automated profiling report (summary and stats) for the whole table
table_report = ProfileReport(
    input_data,
    title='Fire Incident Report',
)
In [9]:
### Display the report as this cell's output
table_report