04_Making real copies of DataFrames in pandas
In [ ]:
import pandas as pd
In [ ]:
# Warm-up: two lists that are built separately are independent objects,
# even when they start out with identical contents.
listA = list(range(1, 5))
listB = list(range(1, 5))

# listName.append(value) modifies the list it is called on, in place
listA.append(5)

# Only listA gained the new value; listB is unchanged
print(listA, listB, sep="\n")
In [ ]:
# Assigning an existing list to a second variable looks like a copy ...
# ... but both names end up bound to the very same list object.
# So what happens when we append through one of the names?
list1 = list(range(1, 5))
list2 = list1  # an alias for list1, not a new list

# Appending through list2 also changes list1: there is only one list
list2.append(5)
print(list1, list2, sep="\n")
# DataFrames in pandas
# One way to build a DataFrame is from a dictionary of lists:
# each key becomes a column label and its list supplies that column's values.
data = dict(
    mag=[1, 1, 3, 3],  # column 1
    date=['April 10 2024', 'April 11 2024', 'April 12 2024', 'April 13 2024'],  # column 2
    inj=[1, 2, 100, 200],  # column 3
)

# Convert the dictionary to a pandas DataFrame
tdata = pd.DataFrame(data=data)

# Look at the data before changing anything
print(tdata)
In [ ]:
# A DataFrame is like a dictionary of lists with additional functionality,
# and the same caution applies to DataFrames as to lists:
# assigning an existing DataFrame (or list) to a new variable does not copy it,
# both names point to the same data.
cleaned_tdata = tdata  # second name for the same DataFrame

# An in-place change made through one name is therefore visible through the other
cleaned_tdata.drop(columns='mag', inplace=True)

# 'mag' was dropped via cleaned_tdata ...
# ... but it is gone from tdata as well!
print(tdata, cleaned_tdata, sep="\n")
In [ ]:
# Now drop rows instead of a column - again, watch the change carry over
# to the DataFrame we thought we had 'copied'.
data = dict(
    mag=[1, 1, 3, 3],  # column 1
    date=['April 10 2024', 'April 11 2024', 'April 12 2024', 'April 13 2024'],  # column 2
    inj=[1, 2, 100, 200],  # column 3
)
tdata = pd.DataFrame(data=data)

# A 'copy' to work on or clean - in reality just a second name for tdata
cleaned_tdata = tdata

# Find the rows where mag == 1 and drop them in place;
# both names share one DataFrame, so both lose those rows
mag_is_one = cleaned_tdata['mag'] == 1
cleaned_tdata.drop(index=cleaned_tdata[mag_is_one].index, inplace=True)
print(tdata, cleaned_tdata, sep="\n")
In [ ]:
# The fix: work on a real copy of the data
data = dict(
    mag=[1, 1, 3, 3],  # column 1
    date=['April 10 2024', 'April 11 2024', 'April 12 2024', 'April 13 2024'],  # column 2
    inj=[1, 2, 100, 200],  # column 3
)
tdata = pd.DataFrame(data=data)

# .copy() builds a REAL copy, a separate DataFrame to work on or clean
cleaned_tdata = tdata.copy()

# Drop the rows where mag == 1 from the copy, in place
unwanted_rows = cleaned_tdata.loc[cleaned_tdata['mag'] == 1].index
cleaned_tdata.drop(index=unwanted_rows, inplace=True)

# First the original (unchanged), then the cleaned copy
print(tdata, cleaned_tdata, sep="\n")
In [ ]:
# Did we really mean to also modify the original tdata?...
# Why did this happen?
# We dropped rows from cleaned_tdata 'in-place', which modified the DataFrame that cleaned_tdata refers to.
# Since tdata and cleaned_tdata both point to the same DataFrame,
# modifying that DataFrame (i.e., using inplace=True) also affects tdata.
# To avoid changing/losing the original data, it is best to create a REAL copy using .copy() BEFORE making any changes to the data
In [ ]:
# Use a multi-index: every row label is a tuple, here (mag, date).
# This allows you to index the DataFrame using the values of multiple columns.
# The DataFrame is created from a dictionary of tuples this time.
data = {
    'mag': (1, 1, 3, 3),
    'date': ('April 10 2024', 'April 11 2024', 'April 12 2024', 'April 13 2024'),
    'inj': (1, 2, 100, 200),
}
tdata = pd.DataFrame(data)

# Set 'mag' and 'date' as a multi-index (they stop being ordinary columns)
tdata.set_index(['mag', 'date'], inplace=True)

# Plain assignment again: new_tdata is a second name for the same DataFrame
new_tdata = tdata
print(tdata)
print(new_tdata)

# Access data for a specific 'mag' and 'date'
print(tdata.loc[(1, 'April 11 2024')])
# The same lookup through the second name gives the same row
print(new_tdata.loc[(1, 'April 11 2024')])

# Assigning to a position of the index raises TypeError; we catch and print it.
# Try to change only the 'mag' value in the first row's index tuple
try:
    tdata.index[0] = (2, 'April 10 2024')
except TypeError as e:
    print(f"TypeError: {e}")

# Try to change only the 'date' value in the first row's index tuple
try:
    tdata.index[0] = (1, 'April 12 2024')
except TypeError as e:
    print(f"TypeError: {e}")
In [ ]:
# Changing the mag or date of a specific row means building a new index tuple,
# and that in turn means building a new DataFrame.
# .reset_index() creates a NEW DataFrame; applied to tdata it removes the multi-index,
# goes back to the default integer index and turns mag and date into columns again.
tdata_copy = tdata.reset_index()  # a separate object from tdata

# First the original (still multi-indexed), then the new, separate DataFrame
print(tdata, tdata_copy, sep="\n")

# mag and date are ordinary columns now, so their values can be changed
tdata_copy.at[0, 'mag'] = 5  # mag in the first row
tdata_copy.at[0, 'date'] = 'June 1 2024'  # date in the first row

# Rebuild the multi-index from the edited columns
tdata_copy = tdata_copy.set_index(['mag', 'date'])

# Although tdata_copy was derived from tdata it is a separate object:
# the edits above appear in tdata_copy only
print(tdata, tdata_copy, sep="\n")
In [ ]:
# In this example, we're setting 'mag' and 'date' as a multi-index for the DataFrame
# This allows us to index the DataFrame using the values in both 'mag' and 'date'. The tuple (mag, date)
# acts as a single key that we can use to index the DataFrame
# This is a case where the immutability of tuples is relevant to the analysis of data in a DataFrame.
# As in functional programming, immutable data structures keep their identity and contents
# constant even as new data structures are created based on them
In [ ]:
# Say we find that we often need to change the data in our dataframe, maybe dates and mags have typos..
# We need a function to simplify this recurring process
# but our dataframe has a tuple multi-index
# So, our function should take a dataframe, a row and a tuple, then return a NEW dataframe with the updated index
def set_new_index(df, row, new_index_values, *, verbose=True):
    """Return a NEW DataFrame in which one row's index label is replaced.

    The index is first turned back into ordinary columns with
    ``reset_index()``, the requested row is edited, and the index is then
    rebuilt from those columns. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. All of its index levels are rebuilt, whatever they are
        named (e.g. the ``('mag', 'date')`` multi-index used in this notebook).
    row : int
        Row to change, given as its label in the default integer index
        that ``reset_index()`` produces (0 is the first row).
    new_index_values : tuple
        Replacement label with one value per index level, e.g. ``(mag, date)``.
    verbose : bool, keyword-only, default True
        Print the input frame and every intermediate frame.

    Returns
    -------
    pandas.DataFrame
        A separate DataFrame carrying the same index level names as ``df``
        and the updated label for ``row``.
    """
    level_count = df.index.nlevels
    if verbose:
        print(df)  # input data we want to change

    # reset_index() returns a new frame with the former index levels
    # inserted as the leading columns
    df_copy = df.reset_index()
    index_cols = list(df_copy.columns[:level_count])
    if verbose:
        print(df_copy)

    # Overwrite the index columns of the requested row in the NEW frame
    df_copy.loc[row, index_cols] = new_index_values
    if verbose:
        print(df_copy)

    # Rebuild the index on the NEW frame and restore the original level
    # names (reset_index() invents placeholder names for unnamed levels)
    df_copy.set_index(index_cols, inplace=True)
    df_copy.index.names = df.index.names
    if verbose:
        print(df_copy)
    return df_copy
# set_new_index does not modify the DataFrame it is given, and it will always
# produce the same output DataFrame for the same input DataFrame, row and tuple
# (no external variables inside the function influence the output).
# Apart from printing its intermediate steps it has no side effects.
new_tdata = set_new_index(df=tdata, row=0, new_index_values=(2, 'April 12 2024'))

# A new DataFrame whose first multi-index tuple has been updated;
# tdata itself is left as it was
print(new_tdata)
In [ ]: