Exploring Minnesota Fire Incident Data

Exploring Fire incident Data

In [1]:
import pandas as pd
import os 
from pandas_profiling import ProfileReport
import openpyxl
import numpy as np
print("packages are installed successfully!")
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Cell In[1], line 3
      1 import pandas as pd
      2 import os 
----> 3 from pandas_profiling import ProfileReport
      4 import openpyxl
      5 import numpy as np

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\__init__.py:22
     16 from pandas_profiling.utils.paths import (
     17     get_config_default,
     18     get_project_root,
     19     get_config_minimal,
     20 )
     21 from pandas_profiling.config import config
---> 22 from pandas_profiling.controller import pandas_decorator
     23 from pandas_profiling.model.describe import describe as describe_df
     24 from pandas_profiling.report import get_report_structure

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\controller\pandas_decorator.py:4
      1 """This file add the decorator on the DataFrame object."""
      2 from pandas import DataFrame
----> 4 from pandas_profiling.__init__ import ProfileReport
      7 def profile_report(df, **kwargs) -> ProfileReport:
      8     """Profile a DataFrame.
      9 
     10     Args:
   (...)
     15         A ProfileReport of the DataFrame.
     16     """

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\__init__.py:23
     21 from pandas_profiling.config import config
     22 from pandas_profiling.controller import pandas_decorator
---> 23 from pandas_profiling.model.describe import describe as describe_df
     24 from pandas_profiling.report import get_report_structure
     27 class ProfileReport(object):

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\model\describe.py:17
     15 from pandas_profiling import __version__
     16 from pandas_profiling.config import config as config
---> 17 from pandas_profiling.model.messages import (
     18     check_variable_messages,
     19     check_table_messages,
     20     warning_type_date,
     21     check_correlation_messages,
     22 )
     24 from pandas_profiling.model import base
     25 from pandas_profiling.model.base import Variable

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\model\messages.py:12
      8 from dateutil.parser import parse
     10 import numpy as np
---> 12 from pandas_profiling.model.correlations import perform_check_correlation
     13 from pandas_profiling.config import config
     14 from pandas_profiling.model.base import Variable

File c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas_profiling\model\correlations.py:11
      9 import numpy as np
     10 from confuse import NotFoundError
---> 11 from pandas.core.base import DataError
     12 from scipy import stats
     14 from pandas_profiling.config import config

ImportError: cannot import name 'DataError' from 'pandas.core.base' (c:\Users\beste\miniconda3\envs\myenv\Lib\site-packages\pandas\core\base.py)
In [ ]:
## % 'magic' command :  ensures we can create charts directly beneath the cells in the notebook
%matplotlib inline 
In [71]:
import importlib   ### OPTIONAL RELOAD to ensure the package is loaded when we need it
importlib.reload(openpyxl)
importlib.reload(np)
c:\Users\beste\miniconda3\envs\profiling_env\lib\importlib\__init__.py:169: UserWarning: The NumPy module was reloaded (imported a second time). This can in some cases result in small but subtle issues and is discouraged.
  _bootstrap._exec(spec, module)
Out[71]:
<module 'numpy' from 'c:\\Users\\beste\\miniconda3\\envs\\profiling_env\\lib\\site-packages\\numpy\\__init__.py'>
In [2]:
## visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Print the versions of Seaborn and Matplotlib
print("Seaborn version:", sns.__version__)
print("Matplotlib version:", plt.matplotlib.__version__)
Seaborn version: 0.13.2
Matplotlib version: 3.8.4
In [ ]:
### Load the spread sheet 

### Define path for notebook and where the data folder is located
current_dir = os.getcwd()
folder_name= "Data"
data_dir= os.path.join(current_dir,folder_name)

file_name= "Naperville_2021_Fire_Incident_Calls.xlsx"
file_path = os.path.join(data_dir,file_name)

### check if we need to make a data folder
if not os.path.exists(data_dir):
    os.mkdir(data_dir)
    print("Folder Created")
else:
    print("Folder already exists")

### Read in the spreadsheet
input_data = pd.read_excel(file_path)
input_data
Folder already exists
Out[ ]:
Object ID X Y State Fire Department ID Fire Department Name Incident Valid Status Incident Release Status Incident Address Type Description Incident Location Number or Milepost ... Number of Other Apparatus Number of Total Apparatus Number of Suppression Personnel Number of EMS Personnel Number of Other Personnel Number of Total Personnel Fire Service Fatalities Non Fire Service Fatalities Fire Service Injuries Non Fire Service Injuries
0 1 -88.181787 41.781629 IL DD132 NAPERVILLE FD Valid Released Street address 636 ... 0 1 0 2 0 2 0 0.0 0 0.0
1 2 -88.208827 41.710117 IL DD132 NAPERVILLE FD Valid Released Street address 3104 ... 0 1 3 0 0 3 0 0.0 0 0.0
2 3 -88.181787 41.781629 IL DD132 NAPERVILLE FD Valid Released Street address 636 ... 0 1 0 2 0 2 0 0.0 0 0.0
3 4 -88.182272 41.773592 IL DD132 NAPERVILLE FD Valid Released Street address 3 ... 0 2 3 2 0 5 0 0.0 0 0.0
4 5 -88.143246 41.794494 IL DD132 NAPERVILLE FD Valid Released Street address 1525 ... 0 2 4 2 0 6 0 0.0 0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5596 5597 -88.143312 41.785108 IL DD132 NAPERVILLE FD Valid Released Street address 833 ... 0 1 0 2 0 2 0 0.0 0 0.0
5597 5598 -88.147494 41.772234 IL DD132 NAPERVILLE FD Valid Released Street address 232 ... 0 2 6 0 0 6 0 0.0 0 0.0
5598 5599 -88.206781 41.785828 IL DD132 NAPERVILLE FD Valid Released Street address 30W041 ... 0 2 3 2 0 5 0 0.0 0 0.0
5599 5600 -88.202791 41.806041 IL DD132 NAPERVILLE FD Valid Released Street address 2035 ... 0 2 6 0 0 6 0 0.0 0 0.0
5600 5601 -88.146113 41.739914 IL DD132 NAPERVILLE FD Valid Released Street address 1529 ... 0 2 3 2 0 5 0 0.0 0 0.0

5601 rows × 77 columns

In [ ]:
### looks like the coordinates are the X and Y , then we have address data fields, incident times and personnel counts
In [32]:
### get sense of the data shape
input_data.shape
Out[32]:
(5601, 77)
In [ ]:
### 5601 rows, 77 columns
In [ ]:
### Can manually check for missing values 

missing_values = input_data.isnull()
print("Missing Values:", missing_values)
Missing Values:       Object ID      X      Y  State  Fire Department ID  \
0         False  False  False  False               False   
1         False  False  False  False               False   
2         False  False  False  False               False   
3         False  False  False  False               False   
4         False  False  False  False               False   
...         ...    ...    ...    ...                 ...   
5596      False  False  False  False               False   
5597      False  False  False  False               False   
5598      False  False  False  False               False   
5599      False  False  False  False               False   
5600      False  False  False  False               False   

      Fire Department Name  Incident Valid Status  Incident Release Status  \
0                    False                  False                    False   
1                    False                  False                    False   
2                    False                  False                    False   
3                    False                  False                    False   
4                    False                  False                    False   
...                    ...                    ...                      ...   
5596                 False                  False                    False   
5597                 False                  False                    False   
5598                 False                  False                    False   
5599                 False                  False                    False   
5600                 False                  False                    False   

      Incident Address Type Description  Incident Location Number or Milepost  \
0                                 False                                 False   
1                                 False                                 False   
2                                 False                                 False   
3                                 False                                 False   
4                                 False                                 False   
...                                 ...                                   ...   
5596                              False                                 False   
5597                              False                                 False   
5598                              False                                 False   
5599                              False                                 False   
5600                              False                                 False   

      ...  Number of Other Apparatus  Number of Total Apparatus  \
0     ...                      False                      False   
1     ...                      False                      False   
2     ...                      False                      False   
3     ...                      False                      False   
4     ...                      False                      False   
...   ...                        ...                        ...   
5596  ...                      False                      False   
5597  ...                      False                      False   
5598  ...                      False                      False   
5599  ...                      False                      False   
5600  ...                      False                      False   

      Number of Suppression Personnel  Number of EMS Personnel  \
0                               False                    False   
1                               False                    False   
2                               False                    False   
3                               False                    False   
4                               False                    False   
...                               ...                      ...   
5596                            False                    False   
5597                            False                    False   
5598                            False                    False   
5599                            False                    False   
5600                            False                    False   

      Number of Other Personnel  Number of Total Personnel  \
0                         False                      False   
1                         False                      False   
2                         False                      False   
3                         False                      False   
4                         False                      False   
...                         ...                        ...   
5596                      False                      False   
5597                      False                      False   
5598                      False                      False   
5599                      False                      False   
5600                      False                      False   

      Fire Service Fatalities  Non Fire Service Fatalities  \
0                       False                        False   
1                       False                        False   
2                       False                        False   
3                       False                        False   
4                       False                        False   
...                       ...                          ...   
5596                    False                        False   
5597                    False                        False   
5598                    False                        False   
5599                    False                        False   
5600                    False                        False   

      Fire Service Injuries  Non Fire Service Injuries  
0                     False                      False  
1                     False                      False  
2                     False                      False  
3                     False                      False  
4                     False                      False  
...                     ...                        ...  
5596                  False                      False  
5597                  False                      False  
5598                  False                      False  
5599                  False                      False  
5600                  False                      False  

[5601 rows x 77 columns]
In [ ]:
### need to sum those True/False
In [ ]:
### Can manually check for missing values 

missing_values = input_data.isnull().sum().sort_values(ascending=False)
print(missing_values.head(40)) ## display set number of fields
Incident Street Suffix Code               5601
Controlled Date / Time                    5599
Detector Alerted Occupants Description    5586
Hazardous Materials Code                  5573
Hazardous Materials Description           5573
Action Taken 3 Description                5476
Incident Street Prefix Code               3592
Incident Apartment                        3547
Mixed Use Code                            3417
Mixed Use Description                     3417
Action Taken 2 Description                2054
Action Taken 2 Code (National)            2054
Incident Street Type Description           292
Incident Response Time (HH:MM:SS)           51
Property Use Code (National)                49
Property Use Code (Group)                   49
Property Use Code (Category)                49
Property Use Description                    49
Non Fire Service Injuries                    4
Non Fire Service Fatalities                  4
Station                                      1
Response Time (Hours)                        0
Arrival Date / Time                          0
Alarm Date - Hour of Day                     0
Alarm Date - Year                            0
Alarm Date / Time                            0
Incident Duration (HH:MM:SS)                 0
Response Time (Minutes)                      0
Number of Other Apparatus                    0
Fire Service Injuries                        0
Fire Service Fatalities                      0
Number of Total Personnel                    0
Number of Other Personnel                    0
Number of EMS Personnel                      0
Number of Suppression Personnel              0
Number of Total Apparatus                    0
Number of EMS Apparatus                      0
Last Unit Cleared Date / Time                0
Number of Suppression Apparatus              0
Action Taken 1 Code (National)               0
dtype: int64
In [ ]:
### Some fields are missing most of the data e.g., Hazardous Materials Code
In [ ]:
### Can manually check for missing values 

missing_values = input_data.isnull().sum().sum()
print(missing_values) ## sum all missing values across all fields
52037
In [ ]:
### Get more details on the table field names, missing values (null counts) and data types
input_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5601 entries, 0 to 5600
Data columns (total 77 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   Object ID                                5601 non-null   int64         
 1   X                                        5601 non-null   float64       
 2   Y                                        5601 non-null   float64       
 3   State                                    5601 non-null   object        
 4   Fire Department ID                       5601 non-null   object        
 5   Fire Department Name                     5601 non-null   object        
 6   Incident Valid Status                    5601 non-null   object        
 7   Incident Release Status                  5601 non-null   object        
 8   Incident Address Type Description        5601 non-null   object        
 9   Incident Location Number or Milepost     5601 non-null   object        
 10  Incident Street Prefix Code              2009 non-null   object        
 11  Incident Street or Highway Name          5601 non-null   object        
 12  Incident Street Type Description         5309 non-null   object        
 13  Incident Street Suffix Code              0 non-null      float64       
 14  Incident Apartment                       2054 non-null   object        
 15  Incident Address Concat                  5601 non-null   object        
 16  Incident City                            5601 non-null   object        
 17  Incident State Code                      5601 non-null   object        
 18  Incident Full Address                    5601 non-null   object        
 19  Incident Type Description                5601 non-null   object        
 20  Incident Type Code Category Description  5601 non-null   object        
 21  Aid Given or Received Code (National)    5601 non-null   object        
 22  Aid Given or Received Description        5601 non-null   object        
 23  Alarm Date - Month of Year               5601 non-null   object        
 24  Alarm Date - Day of Week                 5601 non-null   object        
 25  Incident District                        5601 non-null   object        
 26  Action Taken 1 Description               5601 non-null   object        
 27  Action Taken 2 Description               3547 non-null   object        
 28  Action Taken 3 Description               125 non-null    object        
 29  Detector Alerted Occupants Description   15 non-null     object        
 30  Hazardous Materials Code                 28 non-null     object        
 31  Hazardous Materials Description          28 non-null     object        
 32  Mixed Use Code                           2184 non-null   object        
 33  Mixed Use Description                    2184 non-null   object        
 34  Property Use Code (National)             5552 non-null   object        
 35  Property Use Description                 5552 non-null   object        
 36  Property Use Code (Category)             5552 non-null   object        
 37  Property Use Code (Group)                5552 non-null   object        
 38  Row ID                                   5601 non-null   int64         
 39  Incident ID                              5601 non-null   int64         
 40  Incident Date                            5601 non-null   datetime64[ns]
 41  Incident Number                          5601 non-null   int64         
 42  Exposure                                 5601 non-null   int64         
 43  Station                                  5600 non-null   float64       
 44  Incident Address Type Code               5601 non-null   int64         
 45  Incident Zip Code                        5601 non-null   int64         
 46  Incident Type Code (National)            5601 non-null   int64         
 47  Incident Type Code (Category)            5601 non-null   int64         
 48  Incident Type Code (Group)               5601 non-null   int64         
 49  Alarm Date / Time                        5601 non-null   datetime64[ns]
 50  Alarm Date - Year                        5601 non-null   int64         
 51  Alarm Date - Hour of Day                 5601 non-null   int64         
 52  Arrival Date / Time                      5601 non-null   datetime64[ns]
 53  Incident Response Time (HH:MM:SS)        5550 non-null   object        
 54  Response Time (Hours)                    5601 non-null   float64       
 55  Response Time (Minutes)                  5601 non-null   int64         
 56  Controlled Date / Time                   2 non-null      datetime64[ns]
 57  Last Unit Cleared Date / Time            5601 non-null   datetime64[ns]
 58  Incident Duration (HH:MM:SS)             5601 non-null   object        
 59  Incident Duration (Hours)                5601 non-null   float64       
 60  Incident Duration (Minutes)              5601 non-null   int64         
 61  Fire Department Shift                    5601 non-null   int64         
 62  Incident Alarms                          5601 non-null   int64         
 63  Action Taken 1 Code (National)           5601 non-null   int64         
 64  Action Taken 2 Code (National)           3547 non-null   float64       
 65  Number of Suppression Apparatus          5601 non-null   int64         
 66  Number of EMS Apparatus                  5601 non-null   int64         
 67  Number of Other Apparatus                5601 non-null   int64         
 68  Number of Total Apparatus                5601 non-null   int64         
 69  Number of Suppression Personnel          5601 non-null   int64         
 70  Number of EMS Personnel                  5601 non-null   int64         
 71  Number of Other Personnel                5601 non-null   int64         
 72  Number of Total Personnel                5601 non-null   int64         
 73  Fire Service Fatalities                  5601 non-null   int64         
 74  Non Fire Service Fatalities              5597 non-null   float64       
 75  Fire Service Injuries                    5601 non-null   int64         
 76  Non Fire Service Injuries                5597 non-null   float64       
dtypes: datetime64[ns](5), float64(9), int64(27), object(36)
memory usage: 3.3+ MB
In [ ]:
### we do have some columns with missing values, like incident street suffix code where all are missing (i.e., 0 non-null)
In [ ]:
## We can get a sense of the counts within any variable manually
input_data["Incident Type Description"].value_counts(normalize=True)*100
Out[ ]:
EMS call, excluding vehicle accident with injury      63.452955
Alarm system sounded, no fire - unintentional          7.302267
Assist invalid                                         6.177468
Alarm system sounded due to malfunction                3.981432
Smoke detector activation, no fire - unintentional     3.570791
                                                        ...    
Chemical spill or leak                                 0.017854
Vehicle accident, general cleanup                      0.017854
Person in distress, other                              0.017854
Overpressure rupture from steam, other                 0.017854
Unauthorized burning                                   0.017854
Name: Incident Type Description, Length: 91, dtype: float64
In [117]:
input_data["Incident Type Description"].describe()
Out[117]:
count                                                 5601
unique                                                  91
top       EMS call, excluding vehicle accident with injury
freq                                                  3554
Name: Incident Type Description, dtype: object
In [ ]:
### We can summarize any variable but it would take a long time to do it thoroughly for all of them 

Pandas Profiling

  • a low-code library that allows us to perform automated exploratory data analysis
  • By typing one line of code we produce an entire report of useful statistical and visual analyses
  • The output is a very useful and organized HTML file that we can visualize directly in the notebook or share
In [8]:
### Generate report of table summary and stats

table_report = ProfileReport(input_data, title= 'Fire Incident Report')
In [9]:
table_report
Out[9]:

In [ ]:
### Took 1 minute

### The report is organized in 6 sections that you can click through, including Overview, Alerts, Variables, Missing Values, and Sample

# Overview: shows the number of variables, missing values, duplicate rows, how many numeric and categorical data types
    # *Tip*: Click on More Details
        # Very useful summaries for each variable (column)
    # Alerts tab summarizes the behavior of the variables and overall data issues (e.g., missing values) 
        # High-cardinality - too many categories within one variable , makes grouping analyses more and visualizations more difficult
            # Difficult to interpret patterns, trends, and more costly/time-consuming to train ML models        
        # Imbalance  - too many of the same category within a variable, imbalanced data can skew analyses, creates models that favor that category
            # distorts insights in favor of the dominant category, and creates biased ML models, use special techniques to weigh or undersample
In [ ]:
### Key point: to systematically understand your data most efficiently this is a very solid approach
In [ ]:
### Quick Insight from hexbin plot: A common pattern found in the data is 5 personnel and 5 minutes response time

Finding Interactions in the Data

The interaction report in the ProfileReport result from pandas-profiling provides a visual representation of the relationship between two numerical variables.

Example Interaction

  • "number of total personnel" (y-axis) and "response time in minutes" (x-axis): image.png

Hexbin Plot: a 2-D histogram with hexagonal bins. You feed it two continuous variables (x, y) or hybrid of discrete and continuous, and it cuts the plane into a honeycomb of hexagons, counts how many points fall into each cell, then colors each hexagon by that count (each hexagon is simply counting points)

  1. Hexagonal Binning:

    • The blue-hued hexagons represent bins where data points are grouped based on their values for the two variables.
    • Darker hues indicate higher densities of data points within a bin.
  2. Interpretation of the Darkest Hue:

    • The darkest hue at 5 minutes on both axes suggests that the majority of incidents involve 5 personnel and a response time of 5 minutes.
    • This could indicate a common operational pattern or standard response setup.
  3. Insights:

    • If the hexagons are concentrated around specific values (e.g., 5 minutes and 5 personnel), it suggests a strong clustering of data points at those values.
    • If the hexagons are spread out, it indicates variability in the relationship between the two variables.
  4. Use Case:

    • This visualization helps identify correlations, clusters, or anomalies in the data.
    • For example, if there are outliers (e.g., very high response times with few personnel), they would appear as isolated hexagons
In [ ]:
### Scatter plots also compare two variables but are less clear in this instance
        ### But what happens when we have lots of repeating values?


sns.scatterplot(x="Response Time (Minutes)", y="Number of Total Personnel", data=input_data)
plt.title('Scatter Plot: Response Time vs Total Personnel')
plt.xlabel('Response Time (minutes)')
plt.ylabel('Total Personnel')
plt.show()
Seaborn version: 0.12.2
Matplotlib version: 3.6.3
In [ ]:
### scatter plots all instances , even if they overlap , they will be plotted in same location
In [109]:
### Let's adjust transpanency
sns.scatterplot(x="Response Time (Minutes)", y="Number of Total Personnel", data=input_data, 
    alpha=0.1 # Adjust transparency to make areas with overlapping points darker
)
plt.title('Scatter Plot: Response Time vs Total Personnel')
plt.xlabel('Response Time (minutes)')
plt.ylabel('Total Personnel')
plt.show()
In [ ]:
# get the top 10 categories by their counts
top_10_incident_categories= input_data['Incident Type Description'].value_counts().nlargest(10).index
print(top_10_incident_categories, sep='\n')

# filter to only include row with top 10 categories
filtered_data  =  input_data[input_data["Incident Type Description"].isin(top_10_incident_categories)]

### Let's add some data and color by indicent description
sns.scatterplot(x="Response Time (Minutes)", y="Number of Total Personnel", data=filtered_data, 
    alpha=0.3 # Adjust transparency to make areas with overlapping points darker
    , hue='Incident Type Description' # color points by incident type
    , size= 'Incident Type Description'  # size points by frequency of incident
)
# plt.figure(figsize=(12,16))
plt.title('Scatter Plot: Response Time vs Total Personnel')
plt.xlabel('Response Time (minutes)')
plt.ylabel('Total Personnel')
        # bbox: anchor point with offset (horizontal, vertical)  , loc= point on legend to anchor,borderaxespad= padding between legend and plot
plt.legend(bbox_to_anchor=(1.05,1), loc='upper left',  borderaxespad=0.1)
# plt.tight_layout()
plt.show()
Index(['EMS call, excluding vehicle accident with injury',
       'Alarm system sounded, no fire - unintentional', 'Assist invalid',
       'Alarm system sounded due to malfunction',
       'Smoke detector activation, no fire - unintentional',
       'Defective elevator, no occupants',
       'Smoke detector activation due to malfunction',
       'False alarm or false call, other', 'Gas leak (natural gas or LPG)',
       'Detector activation, no fire - unintentional'],
      dtype='object')
In [ ]:
### we can see the incident type is impacting the number of personnel more than the response time
In [ ]:
### scatter plots with added visuals
    #  we start to see some repeating patterns of these two variables
    # adding category label as coloring not too useful in this case
    #  data is clustered with a lot of points falling on the bottom left around 5 total personnel and 5 minutes
In [ ]:
### Not too helpful. Let's try a hexbin plot
# Hexbin plots are useful for visualizing the density of data points in two dimensions.
In [38]:
sns.set_theme(style="whitegrid")  # Apply Seaborn styling

# Create a hexbin plot using Matplotlib
                # x-axis                            # y-axis                         # gridsize= resolution or detail of the plot
plt.hexbin(input_data["Response Time (Minutes)"], input_data["Number of Total Personnel"], gridsize=30, cmap="inferno", mincnt=1)
plt.colorbar(label="Density")
plt.xlabel("Response Time (Minutes)")
plt.ylabel("Number of Total Personnel")
plt.title("Hexbin Plot: Response Time vs Total Personnel")
plt.show()
In [ ]:
### this plot nicely identifies a pattern of personnel and response time
    # Density tells us the number of points that fall into a bin
    # we can adjust the grid size i.e., number of hexagon bins used to resolve patterns 
    # if there are subtle difference may need to increase but 30 is a good starting point
 
In [ ]:
sns.set_theme(style="whitegrid")  # Apply consistent style, white background w/ gridlines improves readability and consistency

# Create a hexbin plot using Matplotlib
                # x-axis                            # y-axis                         # gridsize= resolution or detail of the plot
plt.hexbin(input_data["Response Time (Minutes)"], input_data["Number of Total Personnel"], gridsize=10, cmap="inferno", mincnt=1)
plt.colorbar(label="Density")
plt.xlabel("Response Time (Minutes)")
plt.ylabel("Number of Total Personnel")
plt.title("Hexbin Plot: Response Time vs Total Personnel")
plt.show()
In [ ]:
### too coarse a resolution (only 10 bins), but we can still see that 5 personnel is very common
    # less clear is the co-occurring response time
In [72]:
# Create a hexbin plot using Matplotlib
                # x-axis                            # y-axis                         # gridsize= resolution or detail of the plot ## aggregate y values, color by the mean y value
plt.hexbin(input_data["Response Time (Minutes)"], input_data["Number of Total Personnel"], gridsize=30, cmap="inferno", mincnt=1, reduce_C_function=np.mean)
plt.colorbar(label="Density")
plt.xlabel("Response Time (Minutes)")
plt.ylabel("Number of Total Personnel")
plt.title("Hexbin Plot: Response Time vs Total Personnel")
plt.show()
In [ ]:
### Explanation:
    # Each hexagon in the plot corresponds to a bin that groups data points based on their x-axis (Response Time) and y-axis (Total Personnel) values.
    # The density bar shows the range of counts for these bins:
    # 0: Indicates bins with no data points.
    # 800+: Indicates bins with the highest number of data points.

### Interpretation:
    # The darker the hexagon, the higher the density of data points in that bin.
    # For example, if a hexagon corresponds to a density of 800, it means that 800 data points fall within the range defined by that hexagon's boundaries.
In [ ]:
### Plot the frequency of points for these variables individually to confirm the pattern


# Calculate mean and median
mean_value = input_data["Response Time (Minutes)"].mean()
median_value = input_data["Response Time (Minutes)"].median()

# Histogram for response time
sns.histplot(input_data["Response Time (Minutes)"])
plt.title('Histogram: Response Time')
plt.xlabel('Response Time (minutes)')
plt.ylabel('Frequency')

# Add vertical lines for mean and median
plt.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
plt.axvline(median_value, color='blue', linestyle='--', label=f'Median: {median_value:.2f}')
# Add legend
plt.legend()
plt.show()


# Calculate mean and median
mean_value = input_data["Number of Total Personnel"].mean()
median_value = input_data["Number of Total Personnel"].median()

# Histogram for total personnel
sns.histplot(input_data["Number of Total Personnel"])
plt.title('Histogram: Total Personnel')
plt.xlabel('Total Personnel')
plt.ylabel('Frequency')
# Add vertical lines for mean and median
plt.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
plt.axvline(median_value, color='blue', linestyle='--', label=f'Median: {median_value:.2f}')

# Add legend
plt.legend()
plt.show()
In [126]:
# Select only numerical columns
numerical_data = input_data.select_dtypes(include=[np.number])

# Compute the correlation matrix
correlation_matrix = numerical_data.corr()

# Mask the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm",annot=False,  mask=mask ,fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Numerical Variables")
plt.show()
In [ ]:
### When values are all the same for a variable we will get blanks or white cells in the correlation matrix
    # better to remove values that have constant values since the correlation will not produce any result
In [128]:
# Select only numerical columns
numerical_data = input_data.select_dtypes(include=[np.number])

# Drop columns with constant values (no variability)
numerical_data = numerical_data.loc[:, numerical_data.nunique() > 1]

# Compute the correlation matrix
correlation_matrix = numerical_data.corr()

# Mask the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Create the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, mask=mask, fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Numerical Variables")
plt.show()
In [ ]:
### Correlation Heatmap :useful visual to help isolate relationships and investigate those further
    # Interesting patterns emerge between action taken codes and incident type codes 
        # another pattern shows a negative relationship between incident response times and incident codes
        # provides a lead to follow-up on
    # Some relationships may seem obvious but even those help confirm our data exploration is tracking with the real-world

links

social