import pandas as pd
import numpy as np
import re
Load the raw data
# Load the data
= pd.read_csv("../data/raw/indigenous-business/bcindigenousbusinesslistings.csv") df
Inspecting the data
# Inspect the data
print(df.info())
print(df.head())
print(f"Initial number of rows: {len(df)}")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Business Name 1259 non-null object
1 Description 1135 non-null object
2 Web Site 699 non-null object
3 City 1258 non-null object
4 Latitude 1258 non-null float64
5 Longitude 1258 non-null float64
6 Keywords 1257 non-null object
7 Region 1259 non-null object
8 Type 1123 non-null object
9 Industry Sector 1222 non-null object
10 Year Formed 648 non-null float64
11 Number of Employees 572 non-null object
dtypes: float64(3), object(9)
memory usage: 118.2+ KB
None
Business Name \
0 Ellipsis Energy Inc
1 Indigenous Community Development & Prosperity ...
2 Formline Construction Ltd.
3 Quilakwa Investments Ltd.
4 Quilakwa Esso
Description \
0 Ellipsis Energy Inc is an Aboriginal owned com...
1 ICDPRO works together with Indigenous communit...
2 With over combined 30 years of experience in t...
3 Quilakwa Investments Ltd. oversees several Ind...
4 Quilakwa Esso is owned by the Splatsin Indian ...
Web Site City Latitude \
0 http://www.ellipsisenergy.ca Moberly Lake 55.819370
1 https://indigenouscommunitydevelopment.com/ Enderby 50.551498
2 https://www.flcon.ca/ Burnaby 49.266050
3 http://www.splatsindc.com Enderby 50.537507
4 NaN Enderby 50.537507
Longitude Keywords \
0 -121.834602 Ellipsis Energy Inc 21 – Mining, quarrying, an...
1 -119.133546 Indigenous Community Development & Prosperity ...
2 123.005840 Formline Construction Ltd. 23 – Construction
3 -119.141955 Quilakwa Investments Ltd.
4 -119.141955 Quilakwa Esso 44-45 - Retail trade
Region Type \
0 Northeast Private Company
1 Thompson / Okanagan Private Company
2 Lower Mainland / Southwest Private Company
3 Thompson / Okanagan Community Owned Company
4 Thompson / Okanagan Community Owned Company
Industry Sector Year Formed \
0 21 – Mining, quarrying, and oil and gas extrac... 2012.0
1 81 – Other services (except public administrat... 2020.0
2 23 – Construction 2021.0
3 72 – Accommodation and food services 1984.0
4 44-45 - Retail trade 1984.0
Number of Employees
0 5 to 9
1 1 to 4
2 1 to 4
3 20 to 49
4 10 to 19
Initial number of rows: 1259
Column Name Standardization
# Clean column names (convert to lowercase and replace spaces with underscores)
= df.columns.str.strip().str.lower().str.replace(' ', '_') df.columns
Remove Unnecessary Columns
# Remove unnecessary columns
= ['description', 'web_site', 'keywords']
columns_to_drop = df.drop(columns=columns_to_drop, errors='ignore') df
Removal of Duplicates
# Remove duplicate rows
= df.drop_duplicates() df
# check no of rows after removing duplicates
print(f"No of rows after removing duplicates: {len(df)}")
No of rows after removing duplicates: 1259
Critical Data Validation
# Remove rows missing critical information
#business_name is a mandatory field here
if 'business_name' in df.columns:
= df[df['business_name'].notna() & (df['business_name'] != '')] df
# check no of rows after removing rows missing critical information
print(f"No of rows: {len(df)}")
No of rows: 1259
Ensure Year is an integer
# Ensure year_formed is a nullable integer
'year_formed'] = pd.to_numeric(df['year_formed'], errors='coerce').astype('Int64') df[
Cleanup industry_sector
# custom function to standardize industry_sector data
def clean_industry_sector(sector):
if pd.isna(sector):
return np.nan
# Convert to string
= str(sector).strip()
sector
# Handle cases starting with colon
if sector.startswith(':'):
= sector[1:].strip()
sector
# Remove ALL number patterns including:
# "23 - ", "44-45 - ", "1.5 - ", "54 – " (with en dash)
= re.sub(r'^[\d\.]+\s*[-–—]?\s*[\d\.]*\s*[-–—]\s*', '', sector).strip()
sector
# Return np.nan if empty, otherwise capitalize first letter
return np.nan if not sector else sector[0].upper() + sector[1:]
print("test cleaning:")
= ":54 – Professional, scientific and technical services"
test_case print(f"'{test_case}' → '{clean_industry_sector(test_case)}'")
'industry_sector'] = df['industry_sector'].apply(clean_industry_sector) df[
test cleaning:
':54 – Professional, scientific and technical services' → 'Professional, scientific and technical services'
Data Formatting
# Trim whitespace in string fields
= ['business_name', 'city', 'industry_sector','region','type']
text_cols = df[text_cols].apply(lambda x: x.str.strip()) df[text_cols]
Save the Cleaned data
# Save cleaned data
"../data/clean/bcindigenousbiz.csv", index=False) df.to_csv(
Validation of Cleaned Data
# Validate cleaned data
= pd.read_csv("../data/clean/bcindigenousbiz.csv")
clean_dataprint(f"Final cleaned dataset rows: {len(clean_data)}") # Final row count
clean_data.head()
Final cleaned dataset rows: 1259
business_name | city | latitude | longitude | region | type | industry_sector | year_formed | number_of_employees | |
---|---|---|---|---|---|---|---|---|---|
0 | Ellipsis Energy Inc | Moberly Lake | 55.819370 | -121.834602 | Northeast | Private Company | Mining, quarrying, and oil and gas extraction | 2012.0 | 5 to 9 |
1 | Indigenous Community Development & Prosperity ... | Enderby | 50.551498 | -119.133546 | Thompson / Okanagan | Private Company | Other services (except public administration) | 2020.0 | 1 to 4 |
2 | Formline Construction Ltd. | Burnaby | 49.266050 | 123.005840 | Lower Mainland / Southwest | Private Company | Construction | 2021.0 | 1 to 4 |
3 | Quilakwa Investments Ltd. | Enderby | 50.537507 | -119.141955 | Thompson / Okanagan | Community Owned Company | Accommodation and food services | 1984.0 | 20 to 49 |
4 | Quilakwa Esso | Enderby | 50.537507 | -119.141955 | Thompson / Okanagan | Community Owned Company | Retail trade | 1984.0 | 10 to 19 |