Diverse Data Hub
  • Data Sets
  • Citation
  • Collaborate
    import pandas as pd
    import numpy as np
    import re

    Load the raw data

    # Load the data
    df = pd.read_csv("../data/raw/indigenous-business/bcindigenousbusinesslistings.csv")

    Inspecting the data

    # Inspect the data
    print(df.info())
    print(df.head())
    print(f"Initial number of rows: {len(df)}") 
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 1259 entries, 0 to 1258
    Data columns (total 12 columns):
     #   Column               Non-Null Count  Dtype  
    ---  ------               --------------  -----  
     0   Business Name        1259 non-null   object 
     1   Description          1135 non-null   object 
     2   Web Site             699 non-null    object 
     3   City                 1258 non-null   object 
     4   Latitude             1258 non-null   float64
     5   Longitude            1258 non-null   float64
     6   Keywords             1257 non-null   object 
     7   Region               1259 non-null   object 
     8   Type                 1123 non-null   object 
     9   Industry Sector      1222 non-null   object 
     10  Year Formed          648 non-null    float64
     11  Number of Employees  572 non-null    object 
    dtypes: float64(3), object(9)
    memory usage: 118.2+ KB
    None
                                           Business Name  \
    0                                Ellipsis Energy Inc   
    1  Indigenous Community Development & Prosperity ...   
    2                         Formline Construction Ltd.   
    3                          Quilakwa Investments Ltd.   
    4                                      Quilakwa Esso   
    
                                             Description  \
    0  Ellipsis Energy Inc is an Aboriginal owned com...   
    1  ICDPRO works together with Indigenous communit...   
    2  With over combined 30 years of experience in t...   
    3  Quilakwa Investments Ltd. oversees several Ind...   
    4  Quilakwa Esso is owned by the Splatsin Indian ...   
    
                                          Web Site           City   Latitude  \
    0                 http://www.ellipsisenergy.ca  Moberly Lake   55.819370   
    1  https://indigenouscommunitydevelopment.com/        Enderby  50.551498   
    2                        https://www.flcon.ca/        Burnaby  49.266050   
    3                    http://www.splatsindc.com        Enderby  50.537507   
    4                                          NaN        Enderby  50.537507   
    
        Longitude                                           Keywords  \
    0 -121.834602  Ellipsis Energy Inc 21 – Mining, quarrying, an...   
    1 -119.133546  Indigenous Community Development & Prosperity ...   
    2  123.005840       Formline Construction Ltd. 23 – Construction   
    3 -119.141955                          Quilakwa Investments Ltd.   
    4 -119.141955                 Quilakwa Esso 44-45 - Retail trade   
    
                           Region                     Type  \
    0                   Northeast          Private Company   
    1         Thompson / Okanagan          Private Company   
    2  Lower Mainland / Southwest          Private Company   
    3         Thompson / Okanagan  Community Owned Company   
    4         Thompson / Okanagan  Community Owned Company   
    
                                         Industry Sector  Year Formed  \
    0  21 – Mining, quarrying, and oil and gas extrac...       2012.0   
    1  81 – Other services (except public administrat...       2020.0   
    2                                  23 – Construction       2021.0   
    3               72 – Accommodation and food services       1984.0   
    4                               44-45 - Retail trade       1984.0   
    
      Number of Employees  
    0              5 to 9  
    1              1 to 4  
    2              1 to 4  
    3            20 to 49  
    4            10 to 19  
    Initial number of rows: 1259

    Column Name Standardization

    #  Clean column names (convert to lowercase and replace spaces with underscores)
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

    Remove Unnecessary Columns

    # Remove unnecessary columns
    columns_to_drop = ['description', 'web_site', 'keywords']
    df = df.drop(columns=columns_to_drop, errors='ignore')

    Removal of Duplicates

    # Remove duplicate rows
    df = df.drop_duplicates()
    # check no of rows after removing duplicates
    print(f"No of rows after removing duplicates: {len(df)}") 
    No of rows after removing duplicates: 1259

    Critical Data Validation

    # Remove rows missing critical information
    #business_name is a mandatory field here
    if 'business_name' in df.columns:
        df = df[df['business_name'].notna() & (df['business_name'] != '')]
    # check no of rows after removing rows missing critical information
    print(f"No of rows: {len(df)}") 
    No of rows: 1259

    Ensure Year is an integer

    # Ensure year_formed is a nullable integer
    df['year_formed'] = pd.to_numeric(df['year_formed'], errors='coerce').astype('Int64')

    Cleanup industry_sector

    # custom function to standardize industry_sector data
    def clean_industry_sector(sector):
        if pd.isna(sector):
            return np.nan
        
        # Convert to string
        sector = str(sector).strip()
        
        # Handle cases starting with colon
        if sector.startswith(':'):
            sector = sector[1:].strip()
        
        # Remove ALL number patterns including:
        # "23 - ", "44-45 - ", "1.5 - ", "54 – " (with en dash)
        sector = re.sub(r'^[\d\.]+\s*[-–—]?\s*[\d\.]*\s*[-–—]\s*', '', sector).strip()
        
        # Return np.nan if empty, otherwise capitalize first letter
        return np.nan if not sector else sector[0].upper() + sector[1:]
    
    
    print("test cleaning:")
    test_case = ":54 – Professional, scientific and technical services"
    print(f"'{test_case}' → '{clean_industry_sector(test_case)}'")
    
    
    df['industry_sector'] = df['industry_sector'].apply(clean_industry_sector)
    test cleaning:
    ':54 – Professional, scientific and technical services' → 'Professional, scientific and technical services'

    Data Formatting

    # Trim whitespace in string fields
    text_cols = ['business_name', 'city', 'industry_sector','region','type']
    df[text_cols] = df[text_cols].apply(lambda x: x.str.strip())

    Save the Cleaned data

    # Save cleaned data
    df.to_csv("../data/clean/bcindigenousbiz.csv", index=False)

    Validation of Cleaned Data

    # Validate cleaned data
    clean_data= pd.read_csv("../data/clean/bcindigenousbiz.csv")
    print(f"Final cleaned dataset rows: {len(clean_data)}")  # Final row count
    clean_data.head()
    Final cleaned dataset rows: 1259
    business_name city latitude longitude region type industry_sector year_formed number_of_employees
    0 Ellipsis Energy Inc Moberly Lake 55.819370 -121.834602 Northeast Private Company Mining, quarrying, and oil and gas extraction 2012.0 5 to 9
    1 Indigenous Community Development & Prosperity ... Enderby 50.551498 -119.133546 Thompson / Okanagan Private Company Other services (except public administration) 2020.0 1 to 4
    2 Formline Construction Ltd. Burnaby 49.266050 123.005840 Lower Mainland / Southwest Private Company Construction 2021.0 1 to 4
    3 Quilakwa Investments Ltd. Enderby 50.537507 -119.141955 Thompson / Okanagan Community Owned Company Accommodation and food services 1984.0 20 to 49
    4 Quilakwa Esso Enderby 50.537507 -119.141955 Thompson / Okanagan Community Owned Company Retail trade 1984.0 10 to 19
     
     

    This page is built with Quarto.