Bhautik Radiya

A Comprehensive Guide to Data Handling and Manipulation with Pandas

Pandas data handling

Discover the ultimate guide to working with datasets using Python’s Pandas library. From reading files, renaming columns, handling missing data, and performing group by operations to saving files in different formats, this article covers all the essential techniques for efficient data manipulation and analysis.
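
All the snippets below assume Pandas is already imported; file paths like D:/data.csv are placeholders for your own files.

import pandas as pd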

# CSV #
df=pd.read_csv("D:/data.csv")

# Excel #
df=pd.read_excel("D:/data.xlsx")

# JSON #
df=pd.read_json("D:/data.json",lines=True)

# Zip File #
df=pd.read_csv("D:/data zip file.zip",compression='zip')

## *** Different Encoded Files *** ##
# Try a few common encodings until one works ('cp1252' is common for Windows exports)
encodings_to_try = ['utf-8', 'cp1252', 'ISO-8859-1']
for encoding in encodings_to_try:
    try:
        df = pd.read_csv('D:/data.csv', encoding=encoding)
        print("File read successfully with encoding:", encoding)
        break
    except UnicodeDecodeError:
        print("Failed to read with encoding:", encoding)

df.head()

Display all the rows or columns

# Display all the rows
df=pd.read_csv("D:/data.csv")
pd.set_option('display.max_rows', None)
df

# Display all the columns
df=pd.read_csv("D:/data.csv")
pd.set_option('display.max_columns', None)
df


# Display all the rows and columns
df=pd.read_csv("D:/data.csv")
pd.set_option('display.max_rows', None)  
pd.set_option('display.max_columns', None) 
df

Rename columns

df.columns = df.columns.str.strip()
df.rename(columns={'OldName1': 'NewName1', 'OldName2': 'NewName2', 'OldName3': 'NewName3'}, inplace=True)
df.head()

Get all the numerical and categorical columns

# Get all the numerical column names
numerical_columns = df.select_dtypes(include=['int64','int32','float64','float32']).columns.tolist()
numerical_columns

# Get all the numerical columns as a dataframe
numerical_df = df[numerical_columns]
numerical_df.head()

# Get all the categorical column names
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_columns

# Get all the categorical columns as a dataframe
categorical_df = df[categorical_columns]
categorical_df.head()

Take a subset of the dataframe

# Change 30000 with your desired number of rows
df=df.sample(30000,random_state=2)
df = df.reset_index(drop=True)

Shuffle the dataset

df = df.sample(frac=1).reset_index(drop=True)
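
For a reproducible shuffle, you can also pin the random seed (42 here is an arbitrary choice):
df = df.sample(frac=1, random_state=42).reset_index(drop=True)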

Set new column as index

# Setting a single column as the index
df.set_index('Column_name', inplace=True)

# Setting multiple columns as the index
df.set_index(['Col1', 'Col2'], inplace=True, drop=True)

Reset Index of Dataframe

df.reset_index(inplace=True)

Display Dataframe

# Display the first rows (5 by default; pass a number, e.g. df.head(50))
df.head()

# Display the last rows (5 by default; pass a number, e.g. df.tail(50))
df.tail()

# Display random rows (1 by default; pass a number, e.g. df.sample(50))
df.sample()

DataFrame shape

df.shape

DataFrame Information

df.info()

DataFrame Description

df.describe()

Data type of columns

# Single column
df['column_name'].dtypes

# All the columns
df.dtypes

All the column names

df.columns

Unique Values of a column

df['Column_name'].unique()

Largest and smallest values of a column

# Change the number in the bracket
largest=df["Column_name"].nlargest(6)

# Change the number in the bracket 
smallest=df["Column_name"].nsmallest(10)

Find duplicate and drop duplicate values

# Count duplicate values
duplicate=df['Column_name'].duplicated().sum()

# Drop Duplicate
drop_duplicate=df['Column_name'].drop_duplicates()

Count of unique values of a column

# Count of unique values of a column
pd.set_option('display.max_rows', None)
count_value=df['Column_name'].value_counts()

# Count of unique values of a column in percentage
pd.set_option('display.max_rows', None)
df['Column_name'].value_counts(normalize=True)* 100

Change the Data type of the column

# Float
df['Column_name'] = df['Column_name'].astype(float)

# Category
df['Column_name'] = df['Column_name'].astype('category')

# Int
df['Column_name'] = df['Column_name'].astype(int)

# Datetime
df['Date_col_name'] = pd.to_datetime(df['Date_col_name'])

# Also can specify the format
df['Date_col_name'] = pd.to_datetime(df['Date_col_name'], format='%Y-%m-%d')
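
# If some values don't match the format, errors='coerce' turns them into NaT instead of raising an error
df['Date_col_name'] = pd.to_datetime(df['Date_col_name'], format='%Y-%m-%d', errors='coerce')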

Drop columns and rows

# Drop a single column (with inplace=True, do not reassign the result)
df.drop('Column_name', axis=1, inplace=True)

# Drop multiple columns
df.drop(['Column_name1','Column_name2'], axis=1, inplace=True)

# Drop a single row
df.drop(row_number, axis=0, inplace=True)
Ex---> df.drop(4, axis=0, inplace=True)

# Drop multiple rows
df.drop([row_number1,row_number2,row_number3...], axis=0, inplace=True)
Ex--> df.drop([3,6,11], axis=0, inplace=True)
    
# Drop rows of the dataframe using column values
df = df[df['Column_name'] != 'value']
Ex---> df = df[df['Brand'] != 'Others']

Column and row selection

# Single Column
single_column = df['ColumnName']

# Select multiple columns
multiple_columns = df[['Column1', 'Column2']]

# Single Row
single_row=df.loc[[row_number]]
Ex---> single_row=df.loc[[2]]

# Multiple rows
multiple_rows = df.loc[[row_number,row_number,row_number..]]
Ex---> multiple_rows = df.loc[[2,4,5]]

# Take only specific characters of a column
df['Column_name'] = df['Column_name'].astype(str).str[:number]
Ex---> df['Year'] = df['Year'].astype(str).str[:4]

# Select Range of rows
df1=df.loc[num:num]
Ex--> df1=df.loc[5:8]
Ex--> df1=df.loc[:8]
Ex--> df1=df.loc[8:]

# Single value of a single column
single_value = df.loc[row_number,"Column_name"]
Ex---> single_value = df.loc[4,"Airline"]

# Single Column multiple rows
single_column_multiple_rows=df.loc[num:num,"Column_name"]
Ex---> single_column_multiple_rows=df.loc[1:3,"Airline"]

# Multiple rows, multiple columns
multiple_row_multiple_columns=df.loc[num:num,["Column_name1","Column_name2"]]
Ex---> multiple_row_multiple_columns=df.loc[1:3,["Name","Grade","Ranking"]]

# Using a condition to select
df1=df.loc[df["Airline"]=="IndiGo"]
df1=df.loc[df["Id"]<4]


# Get last rows
last_10_rows_index = df.index[-10:]
last_10 = df.loc[last_10_rows_index]

Filter DataFrame

'''
&--> AND operator
|--> OR operator
'''
mask1=df['Column_name']=="Value"
df[mask1]

Ex:1---> 
mask1=df['CarName']=="BMW"
df[mask1]

Ex:2--->
mask1=df['CarName']=="BMW"
mask2=df['FuelType']=="Gas"
df=df[mask1 & mask2]
df.head()

Ex:3--->
mask1=df['CarName']=="BMW"
mask2=df['FuelType']=="Gas"
mask3=df['Transmission']=="Automatic"
df=df[mask1 & (mask2 | mask3)]
df.head()

Ex:4--->
mask1=df['CarName']=="BMW"
mask2=df['Price']<=40000
df=df.loc[mask1 & mask2,["Total marks","Grade","Ranking"]]
df.head()

Ex:5--->
mask1=df['CarName']=="BMW"
mask2=df['Price'].between(40000,754000)
df=df.loc[mask1 & mask2,["Grade","Ranking"]]
df.head()

Splitting column

# Below, a comma (,) is used as the separator; change it to suit your data
df[['First', 'Second']] = df['Column'].str.split(',', expand=True)

# Using a regular expression (each capture group becomes a column)
df[['First', 'Second']] = df['Column'].str.extract(r'ID: (\d+) Name: (\w+)')
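
As a quick illustration of the pattern above, with made-up values:
demo = pd.DataFrame({'Column': ['ID: 101 Name: Alice', 'ID: 102 Name: Bob']})
demo[['First', 'Second']] = demo['Column'].str.extract(r'ID: (\d+) Name: (\w+)')
demo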

Merge, concatenate and join DataFrames

# Merge side by side
df=pd.merge(df1,df2,right_index=True,left_index=True)

# Concatenate >> stacks one on top of the other
df=pd.concat([df1,df2],ignore_index=True)

# Join side by side
df=df1.join(df2)
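
# Merge on a shared key column (assuming a hypothetical 'id' column in both frames)
df=pd.merge(df1,df2,on='id',how='inner')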

Sort dataframe

# Ascending order
df=df.sort_values(by="column_name",ascending=True)

# Descending order
df=df.sort_values(by="column_name",ascending=False)

# Sort using multiple columns
df = df.sort_values(by=['Column1', 'Column2'], ascending=False)

# Sort by index
df = df.sort_index(ascending=False)

Create and add a new column

# Build a list of values (here simply copied from an existing column) and attach it
values=[]
for i in df['ColName']:
    values.append(i)
df['New_Column_name']=values

Replace a column

data = {'A': [1, 2, 3], 'B': ['x', 'y', 'z']}
df = pd.DataFrame(data)

new_data = ['p', 'q', 'r']

df['B'] = pd.Series(new_data)
df

Create DataFrame

# From a list
x=[1,2,3]
df=pd.DataFrame(x)
df


x={"id":[1,2,3],"Name":["R","A","F"],"Ratting":[4.00,3.50,3.45]}
df=pd.DataFrame(x)
df


x=[{"id":1,"Name":"A","Ratting":3.0},{"id":2,"Name":"B","Ratting":5.0}]
df=pd.DataFrame(x)
df

Create Series

x=[1,"data frame","python","data science"]
df=pd.Series(x)
df

x=[1,2,3]
df=pd.Series(x,dtype=float)
df

Check missing values

# For a single column, as a percentage
df['Airline'].isnull().sum()/df.shape[0]*100

# For all columns, as a percentage
df.isnull().sum()/df.shape[0]*100

# For all columns, as counts
df.isnull().sum()

Dropna function

axis: Determines whether to drop rows or columns. By default it is 0 (rows); setting it to 1 drops columns with NaN values.

how: Specifies the criteria used to drop NaN values:
  'any': Drops the row or column if any NaN value is present (default).
  'all': Drops the row or column only if all values are NaN.

thresh: Requires a minimum number of non-NaN values. For example, thresh=2 drops rows or columns that have fewer than two non-NaN values.

# Drop rows containing any NaN
df=df.dropna()

# Drop columns containing any NaN
df=df.dropna(axis=1)

# Drop rows where all values are NaN
df=df.dropna(how="all")

# Keep only rows with at least 4 non-NaN values
df=df.dropna(thresh=4,axis=0)

Fillna function

# Fill using a fixed value
df=df.fillna("FFFFF")
df['column_name']=df['column_name'].fillna("FFFFF")

# Forward fill: fill a missing cell using the previous valid value
df=df.ffill()
df['column_name']=df['column_name'].ffill()

# Backward fill: fill a missing cell using the next valid value
df=df.bfill()
df['column_name']=df['column_name'].bfill()

Replace Values

# Replace multiple characters by chaining str.replace
df['Column_Name'] = df['Column_Name'].str.replace('₹', '').str.replace(',', '')

##### Keep only letters and whitespace, removing digits and symbols ###########
df['Column_Name'] = df['Column_Name'].str.replace(r'[^a-zA-Z\s]', '', regex=True)


# Replace single
df['Column_Name'] = df['Column_Name'].str.replace(',', '')

# Replace multiple values using a dictionary
replace_dict = {'A': 'Apple', 'B': 'Banana', 'C': 'Cherry'}
df['column_name'] = df['column_name'].replace(replace_dict)


###################################################
'''
Values present in the list below are kept; every other
value in the selected column is replaced with 'Others'
'''
given_car_brands = [
    'Toyota', 'Hyundai', 'Mazda', 'Holden', 'Ford', 'Mitsubishi', 'Nissan',
    'Volkswagen', 'Kia', 'Mercedes-Benz', 'Subaru', 'BMW', 'Audi', 'Honda',
    'Jeep', 'Suzuki', 'MG', 'Land', 'Isuzu', 'GWM', 'Lexus', 'Renault',
    'Volvo', 'LDV', 'Skoda'
]

df['Column_Name'] = df['Column_Name'].apply(lambda x: x if x in given_car_brands else 'Others')
# Here 'Others' is the replacement for any value not in the list

Clean numerical and categorical values

# For a numerical column: keep only digits, removing other characters and symbols
df['Column_name'] = df['Column_name'].str.replace(r'\D', '', regex=True)

# For a categorical column: keep only letters and digits, removing other symbols
df['Column_name'] = df['Column_name'].str.replace(r'[^a-zA-Z0-9]', '', regex=True)

KNN imputer and simple imputer

  1. If your data frame contains only numerical columns, use the first snippet.
  2. If your data frame contains only categorical columns, use the second snippet.
  3. If your data frame contains both categorical and numerical columns, use the third snippet.

### ************ For all numerical column ************** ######
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)  
df_imputed = imputer.fit_transform(df)  
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)



### ********** For all categorical column ************ #########
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)



### ********** For both categorical and numerical columns ********** ###########
from sklearn.impute import KNNImputer, SimpleImputer

numerical_columns = df.select_dtypes(include=['number']).columns.tolist()
numerical_df=df[numerical_columns]
knnimputer = KNNImputer(n_neighbors=5)  
numerical_cln_df = knnimputer.fit_transform(numerical_df)  
numerical_cln_df = pd.DataFrame(numerical_cln_df, columns=numerical_df.columns)

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
categorical_df = df[categorical_columns]
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_cln_df = pd.DataFrame(categorical_imputer.fit_transform(categorical_df), columns=categorical_df.columns)

df=pd.merge(categorical_cln_df,numerical_cln_df,right_index=True,left_index=True)

Save files in different formats

df.to_excel("new_file_name.xlsx")
df.to_csv("new_file_name.csv")
df.to_csv("new_file_name.json")

Groupby

# *****Single Column********
g1 =df.groupby(['column_name'])['group_val_column_name'].mean().reset_index()
g1=g1.sort_index(ascending=False)
g1

Ex:---->
g1 =df.groupby(['Car'])['Price'].mean().reset_index()
g1=g1.sort_index(ascending=False)
g1

# ******** Multiple Column **********
g1 =df.groupby(['column_name','column_name'...])['group_val_column_name'].sum().reset_index()
g1=g1.sort_index(ascending=False)
g1

Ex:--->
g1 =df.groupby(['car','transmission'])['price'].sum().reset_index()
g1=g1.sort_index(ascending=False)
g1

Aggregate

result1 = df.groupby('Column_name')['Value_column_name'].agg(['mean', 'max', 'min']).reset_index()

Ex:--->
result1 = df.groupby('car')['Price'].agg(['mean', 'max', 'min']).reset_index()
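
# To name the output columns directly, named aggregation also works (same hypothetical columns)
result1 = df.groupby('car').agg(avg_price=('Price','mean'), max_price=('Price','max')).reset_index()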
