Diverse Data Hub
  • Data Sets
  • Citation
  • Collaborate
  1. LGBTIQ-rights Dataset Cleaning Steps

On this page

  • LGBTIQ+-Specific Datasets
  • Python Code to Analyze Each Dataset

LGBTIQ-rights Dataset Cleaning Steps

LGBTIQ+-Specific Datasets

First, we will examine how many unique years and countries are included across the different datasets.

Python Code to Analyze Each Dataset

import pandas as pd

# Function to summarize unique years and countries
def summarize_years_and_countries(df, dataset_name):
    return pd.DataFrame({
        "Dataset": [dataset_name],
        "Unique Years": [df["Year"].nunique()],
        "Unique Countries": [df["Entity"].nunique()]
    })



# Censorship of LGBTIQ Issues
df_censorship = pd.read_csv("../data/raw/LGBTIQ-rights/censorship-of-lgbtiq-issues/censorship-of-lgbtiq-issues.csv")
censorship_summary = summarize_years_and_countries(df_censorship, "Censorship of LGBTIQ Issues")

# Employment Discrimination
df_employment_discrimination = pd.read_csv("../data/raw/LGBTIQ-rights/employment-discrimination/employment-discrimination-lgbt-equaldex.csv")
employment_discrimination_summary = summarize_years_and_countries(df_employment_discrimination, "Employment Discrimination")

# Economic Inequality (Gini Index)
df_gini = pd.read_csv("../data/raw/LGBTIQ-rights/economic-inequality-gini-index/economic-inequality-gini-index.csv")

gini_summary = summarize_years_and_countries(df_gini, "Economic Inequality (Gini Index)")

# GDP per Capita
df_gdp = pd.read_csv("../data/raw/LGBTIQ-rights/gdp-per-capita/gdp-per-capita-worldbank.csv")
gdp_summary = summarize_years_and_countries(df_gdp, "GDP per Capita")

# Government Expenditure on Education
df_education = pd.read_csv("../data/raw/LGBTIQ-rights/government-expenditure-on-education/total-government-expenditure-on-education-gdp.csv")
education_summary = summarize_years_and_countries(df_education, "Government Education Expenditure")

# Gender-Affirming Care
df_gender_care = pd.read_csv("../data/raw/LGBTIQ-rights/gender-affirming-care/gender-affirming-care.csv")
gender_care_summary = summarize_years_and_countries(df_gender_care, "Gender-Affirming Care")

# Same-Sex Marriage Rights
df_marriage = pd.read_csv("../data/raw/LGBTIQ-rights/marriage-same-sex-partners/marriage-same-sex-partners-equaldex.csv")
marriage_summary = summarize_years_and_countries(df_marriage, "Same-Sex Marriage")

# Legal Gender Change Rights
df_legal_gender = pd.read_csv("../data/raw/LGBTIQ-rights/right-to-change-legal-gender/right-to-change-legal-gender-equaldex.csv")
legal_gender_summary = summarize_years_and_countries(df_legal_gender, "Legal Gender Change")

def summarize_countries_per_year(df, dataset_name):
    return (
        df.groupby("Year")["Entity"]
        .nunique()
        .reset_index(name="num_countries")
        .assign(Dataset=dataset_name)
    )



censorship_summary = summarize_countries_per_year(df_censorship, "Censorship of LGBTIQ Issues")
employment_summary = summarize_countries_per_year(df_employment_discrimination, "Employment Discrimination")
gini_summary = summarize_countries_per_year(df_gini, "Economic Inequality (Gini Index)")
gdp_summary = summarize_countries_per_year(df_gdp, "GDP per Capita")
education_summary = summarize_countries_per_year(df_education, "Government Education Expenditure")
gender_care_summary = summarize_countries_per_year(df_gender_care, "Gender-Affirming Care")
marriage_summary = summarize_countries_per_year(df_marriage, "Same-Sex Marriage")
legal_gender_summary = summarize_countries_per_year(df_legal_gender, "Legal Gender Change")

# Combine
summary_all = pd.concat([
    censorship_summary,
    employment_summary,
    gini_summary,
    gdp_summary,
    education_summary,
    gender_care_summary,
    marriage_summary,
    legal_gender_summary
], ignore_index=True)

# Pivot
summary_pivot = summary_all.pivot_table(
    index="Dataset",
    columns="Year",
    values="num_countries",
    fill_value=0
).reset_index()

summary_pivot
Year Dataset 1870 1913 1937 1950 1951 1952 1953 1954 1955 ... 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025
0 Censorship of LGBTIQ Issues 0.0 0.0 0.0 4.0 4.0 4.0 4.0 4.0 4.0 ... 34.0 37.0 37.0 40.0 41.0 43.0 46.0 48.0 50.0 194.0
1 Economic Inequality (Gini Index) 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 87.0 83.0 96.0 82.0 70.0 75.0 28.0 4.0 0.0 0.0
2 Employment Discrimination 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 63.0 66.0 73.0 78.0 81.0 84.0 86.0 87.0 88.0 183.0
3 GDP per Capita 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 213.0 213.0 213.0 213.0 213.0 213.0 212.0 206.0 0.0 0.0
4 Gender-Affirming Care 0.0 0.0 0.0 9.0 9.0 9.0 9.0 9.0 9.0 ... 66.0 67.0 70.0 71.0 75.0 76.0 79.0 81.0 83.0 164.0
5 Government Education Expenditure 5.0 8.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 184.0 187.0 182.0 184.0 181.0 180.0 149.0 66.0 0.0 0.0
6 Legal Gender Change 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 87.0 91.0 95.0 98.0 98.0 98.0 102.0 103.0 103.0 194.0
7 Same-Sex Marriage 0.0 0.0 0.0 12.0 13.0 13.0 14.0 14.0 14.0 ... 126.0 128.0 131.0 134.0 135.0 135.0 135.0 138.0 138.0 194.0

8 rows × 80 columns

import matplotlib.pyplot as plt
import seaborn as sns
df = summary_all.copy()
print(df)
     Year  num_countries                      Dataset
0    1950              4  Censorship of LGBTIQ Issues
1    1951              4  Censorship of LGBTIQ Issues
2    1952              4  Censorship of LGBTIQ Issues
3    1953              4  Censorship of LGBTIQ Issues
4    1954              4  Censorship of LGBTIQ Issues
..    ...            ...                          ...
497  2021             98          Legal Gender Change
498  2022            102          Legal Gender Change
499  2023            103          Legal Gender Change
500  2024            103          Legal Gender Change
501  2025            194          Legal Gender Change

[502 rows x 3 columns]
import matplotlib.pyplot as plt

for dataset in df["Dataset"].unique():
    subset = df[df["Dataset"] == dataset]

    if subset["num_countries"].notna().sum() == 0:
        print(f"Skipping {dataset} – all values are null")
        continue

    plt.figure(figsize=(8, 4))
    plt.plot(subset["Year"], subset["num_countries"], marker='o', linestyle='-')
    plt.title(f"Non-Null Value Count per Year – {dataset}")
    plt.xlabel("Year")
    plt.ylabel("Count of countries that has data")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## So based on the result, I think after 2000, we have an acceptable number of countries that have data ( all of them at least have 50 coountires after this year with the valid data)
## So I filter the dataset for after 2000, please let me know if you want to change this
## This frame would be our base table for our left join 
## summary_all
## I've noticed that for some csv files. we do not have code, so I just want to left join on Year and Country

columns_to_check = ["Year", "Entity", "Code"]

dataframes = {
    "df_censorship": df_censorship,
    "df_employment_discrimination": df_employment_discrimination,
    "df_gini": df_gini,
    "df_gdp": df_gdp,
    "df_education": df_education,
    "df_gender_care": df_gender_care,
    "df_marriage": df_marriage,
    "df_legal_gender": df_legal_gender,
}

for name, df in dataframes.items():
    print(f"\n{name} - Nulls in Key Columns:")
    for col in columns_to_check:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            print(f"  {col}: {null_count} null(s)")
        else:
            print(f"  {col}: Column not found")

df_censorship - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_employment_discrimination - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_gini - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 133 null(s)

df_gdp - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 458 null(s)

df_education - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 428 null(s)

df_gender_care - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_marriage - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)

df_legal_gender - Nulls in Key Columns:
  Year: 0 null(s)
  Entity: 0 null(s)
  Code: 0 null(s)
# First, I combined all datasets into one long DataFrame 
## Please notice that stacking all the rows from these datasets on top of each other
all_data = pd.concat([
    df_censorship[["Entity", "Code", "Year"]],
    df_employment_discrimination[["Entity","Code", "Year"]],
    df_gini[["Entity","Code", "Year"]],
    df_gdp[["Entity","Code", "Year"]],
    df_education[["Entity","Code", "Year"]],
    df_gender_care[["Entity","Code", "Year"]],
    df_marriage[["Entity","Code", "Year"]],
    df_legal_gender[["Entity","Code", "Year"]],
], ignore_index=True)

##
## Show rows where 'Code' is null
null_code_rows = all_data[all_data["Code"].isna()]
print(null_code_rows)
                                Entity Code  Year
3108                 Argentina (urban)  NaN  1980
3109                 Argentina (urban)  NaN  1986
3110                 Argentina (urban)  NaN  1987
3111                 Argentina (urban)  NaN  1991
3112                 Argentina (urban)  NaN  1992
...                                ...  ...   ...
18012  Western and Central Africa (WB)  NaN  2018
18013  Western and Central Africa (WB)  NaN  2019
18014  Western and Central Africa (WB)  NaN  2020
18015  Western and Central Africa (WB)  NaN  2021
18016  Western and Central Africa (WB)  NaN  2022

[1019 rows x 3 columns]
import pandas as pd

## Create a mapping from Entity → Code using non-null values
entity_to_code_map = (
    all_data[all_data["Code"].notna()]
    .drop_duplicates(subset="Entity")
    .set_index("Entity")["Code"]
    .to_dict()
)

## Create lists to store matched and unmatched entity names
matched_entities = []
unmatched_entities = []

## Fill missing codes and track matches
def fill_code_and_track(row):
    if pd.isna(row["Code"]):
        matched_code = entity_to_code_map.get(row["Entity"])
        if matched_code is not None:
            matched_entities.append(row["Entity"])
            return matched_code
        else:
            unmatched_entities.append(row["Entity"])
            return None
    return row["Code"]

all_data["Code"] = all_data.apply(fill_code_and_track, axis=1)

## Remove duplicates from tracking lists
matched_entities = sorted(set(matched_entities))
unmatched_entities = sorted(set(unmatched_entities))

print("✅ Finished filling missing Code values.")
print(f"✅ {len(matched_entities)} entities were successfully matched:")
for entity in matched_entities:
    print("  ✓", entity)

print("\n❗ Entities with no Code found:")
for entity in unmatched_entities:
    print("  ✗", entity)

print(f"\n🧮 Remaining rows with missing Code: {all_data['Code'].isna().sum()}")
## As we can see for these countries, we do not have code, please let mw know, what should we do for them 
✅ Finished filling missing Code values.
✅ 0 entities were successfully matched:

❗ Entities with no Code found:
  ✗ Arab World (WB)
  ✗ Argentina (urban)
  ✗ Bolivia (urban)
  ✗ Central Europe and the Baltics (WB)
  ✗ China (rural)
  ✗ China (urban)
  ✗ Colombia (urban)
  ✗ EU (27)
  ✗ East Asia and Pacific (WB)
  ✗ East Asia and the Pacific (WB)
  ✗ Ecuador (urban)
  ✗ Ethiopia (rural)
  ✗ Europe and Central Asia (WB)
  ✗ European Union (27)
  ✗ Faeroe Islands
  ✗ High-income countries
  ✗ Honduras (urban)
  ✗ India (rural)
  ✗ India (urban)
  ✗ Latin America and Caribbean (WB)
  ✗ Low-income countries
  ✗ Lower-middle-income countries
  ✗ Micronesia (country) (urban)
  ✗ Middle East and North Africa (WB)
  ✗ Middle-income countries
  ✗ North America (WB)
  ✗ Rwanda (rural)
  ✗ South Asia (WB)
  ✗ Southern and Eastern Africa (WB)
  ✗ Sub-Saharan Africa (WB)
  ✗ Suriname (urban)
  ✗ Upper-middle-income countries
  ✗ Uruguay (urban)
  ✗ Western and Central Africa (WB)

🧮 Remaining rows with missing Code: 1019
## So I just created my base table based on Year and Country and filtered it out for Year > 2000  and <= 2023
filtered_data = all_data[(all_data["Year"] > 2000) & (all_data["Year"] <= 2023)]
base_table = filtered_data[["Year", "Entity", "Code"]].drop_duplicates()
base_table = base_table.sort_values(by=["Year", "Entity"]).reset_index(drop=True)
base_table
Year Entity Code
0 2001 Afghanistan AFG
1 2001 Albania ALB
2 2001 Algeria DZA
3 2001 Andorra AND
4 2001 Angola AGO
... ... ... ...
5265 2023 Vietnam VNM
5266 2023 World OWID_WRL
5267 2023 Yemen YEM
5268 2023 Zambia ZMB
5269 2023 Zimbabwe ZWE

5270 rows × 3 columns

merged_wide_df = base_table.copy()
datasets = {
    "censorship": df_censorship,
    "employment": df_employment_discrimination,
    "gini": df_gini,
    "gdp": df_gdp,
    "education": df_education,
    "gendercare": df_gender_care,
    "marriage": df_marriage,
    "legalgender": df_legal_gender,
}


for name, df in datasets.items():

    value_cols = [col for col in df.columns if col not in ["Entity", "Year"]]
    df_renamed = df.rename(columns={col: f"{col}_{name}" for col in value_cols})
    merged_wide_df = merged_wide_df.merge(
        df_renamed,
        on=["Entity", "Year"],
        how="left"
    )

print("Final shape:", merged_wide_df.shape)
Final shape: (5270, 20)
merged_wide_df.head()
Year Entity Code Code_censorship Censorship of LGBT+ issues (historical)_censorship Code_employment LGBT+ employment discrimination (historical)_employment Code_gini Gini coefficient_gini 990179-annotations_gini Code_gdp GDP per capita, PPP (constant 2021 international $)_gdp Code_education Public spending on education as a share of GDP_education Code_gendercare Gender-affirming care (historical)_gendercare Code_marriage Same-sex marriage (historical)_marriage Code_legalgender Right to change legal gender (historical)_legalgender
0 2001 Afghanistan AFG NaN NaN NaN NaN NaN NaN NaN AFG 1454.1108 NaN NaN NaN NaN AFG Banned NaN NaN
1 2001 Albania ALB NaN NaN NaN NaN NaN NaN NaN ALB 7215.8200 ALB 3.4587 NaN NaN NaN NaN NaN NaN
2 2001 Algeria DZA DZA Imprisonment as punishment NaN NaN NaN NaN NaN DZA 11742.5950 NaN NaN NaN NaN DZA Banned NaN NaN
3 2001 Andorra AND NaN NaN NaN NaN NaN NaN NaN AND 59109.0160 NaN NaN AND Restricted NaN NaN NaN NaN
4 2001 Angola AGO NaN NaN NaN NaN NaN NaN NaN AGO 6049.1630 NaN NaN NaN NaN NaN NaN NaN NaN
print("Long format shape:", merged_wide_df.shape)
Long format shape: (5270, 20)
list(merged_wide_df.columns)
['Year',
 'Entity',
 'Code',
 'Code_censorship',
 'Censorship of LGBT+ issues (historical)_censorship',
 'Code_employment',
 'LGBT+ employment discrimination (historical)_employment',
 'Code_gini',
 'Gini coefficient_gini',
 '990179-annotations_gini',
 'Code_gdp',
 'GDP per capita, PPP (constant 2021 international $)_gdp',
 'Code_education',
 'Public spending on education as a share of GDP_education',
 'Code_gendercare',
 'Gender-affirming care (historical)_gendercare',
 'Code_marriage',
 'Same-sex marriage (historical)_marriage',
 'Code_legalgender',
 'Right to change legal gender (historical)_legalgender']
# Drop all columns that start with 'Code_'
merged_wide_df = merged_wide_df.loc[:, ~merged_wide_df.columns.str.contains("Code_")]
list(merged_wide_df.columns)
['Year',
 'Entity',
 'Code',
 'Censorship of LGBT+ issues (historical)_censorship',
 'LGBT+ employment discrimination (historical)_employment',
 'Gini coefficient_gini',
 '990179-annotations_gini',
 'GDP per capita, PPP (constant 2021 international $)_gdp',
 'Public spending on education as a share of GDP_education',
 'Gender-affirming care (historical)_gendercare',
 'Same-sex marriage (historical)_marriage',
 'Right to change legal gender (historical)_legalgender']
exclude_cols = ["Entity", "Year"]
value_cols = merged_wide_df.columns.difference(exclude_cols)
cleaned_long_df = merged_wide_df.dropna(subset=value_cols, how='all').reset_index(drop=True)
print("Shape after cleaning:", cleaned_long_df.shape)
cleaned_long_df
Shape after cleaning: (5270, 12)
Year Entity Code Censorship of LGBT+ issues (historical)_censorship LGBT+ employment discrimination (historical)_employment Gini coefficient_gini 990179-annotations_gini GDP per capita, PPP (constant 2021 international $)_gdp Public spending on education as a share of GDP_education Gender-affirming care (historical)_gendercare Same-sex marriage (historical)_marriage Right to change legal gender (historical)_legalgender
0 2001 Afghanistan AFG NaN NaN NaN NaN 1454.1108 NaN NaN Banned NaN
1 2001 Albania ALB NaN NaN NaN NaN 7215.8200 3.4587 NaN NaN NaN
2 2001 Algeria DZA Imprisonment as punishment NaN NaN NaN 11742.5950 NaN NaN Banned NaN
3 2001 Andorra AND NaN NaN NaN NaN 59109.0160 NaN Restricted NaN NaN
4 2001 Angola AGO NaN NaN NaN NaN 6049.1630 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
5265 2023 Vietnam VNM NaN NaN NaN NaN 13491.8800 NaN Legal Banned Legal, surgery required
5266 2023 World OWID_WRL NaN NaN NaN NaN 20670.9410 NaN NaN NaN NaN
5267 2023 Yemen YEM NaN NaN NaN NaN NaN NaN NaN Banned NaN
5268 2023 Zambia ZMB Imprisonment as punishment NaN NaN NaN 3673.4841 NaN NaN Banned NaN
5269 2023 Zimbabwe ZWE Imprisonment as punishment NaN NaN NaN 3442.2512 NaN NaN Banned NaN

5270 rows × 12 columns

cleaned_long_df.tail()
Year Entity Code Censorship of LGBT+ issues (historical)_censorship LGBT+ employment discrimination (historical)_employment Gini coefficient_gini 990179-annotations_gini GDP per capita, PPP (constant 2021 international $)_gdp Public spending on education as a share of GDP_education Gender-affirming care (historical)_gendercare Same-sex marriage (historical)_marriage Right to change legal gender (historical)_legalgender
5265 2023 Vietnam VNM NaN NaN NaN NaN 13491.8800 NaN Legal Banned Legal, surgery required
5266 2023 World OWID_WRL NaN NaN NaN NaN 20670.9410 NaN NaN NaN NaN
5267 2023 Yemen YEM NaN NaN NaN NaN NaN NaN NaN Banned NaN
5268 2023 Zambia ZMB Imprisonment as punishment NaN NaN NaN 3673.4841 NaN NaN Banned NaN
5269 2023 Zimbabwe ZWE Imprisonment as punishment NaN NaN NaN 3442.2512 NaN NaN Banned NaN
print(cleaned_long_df.columns.tolist())
['Year', 'Entity', 'Code', 'Censorship of LGBT+ issues (historical)_censorship', 'LGBT+ employment discrimination (historical)_employment', 'Gini coefficient_gini', '990179-annotations_gini', 'GDP per capita, PPP (constant 2021 international $)_gdp', 'Public spending on education as a share of GDP_education', 'Gender-affirming care (historical)_gendercare', 'Same-sex marriage (historical)_marriage', 'Right to change legal gender (historical)_legalgender']
cleaned_long_df.columns = (
    cleaned_long_df.columns
    .str.replace("Entity", "country", regex=False)
    .str.replace("Year", "year", regex=False)
    .str.replace("Code", "country-code", regex=False)
    .str.replace("Censorship of LGBT+ issues (historical)_censorship", "lgbtq-censorship", regex=False)
    .str.replace("LGBT+ employment discrimination (historical)_employment", "employment-discrimination", regex=False)
    .str.replace("Gini coefficient_gini", "gini-index", regex=False)
    .str.replace("GDP per capita, PPP (constant 2021 international $)_gdp", "gdp-per-capita", regex=False)
    .str.replace("Public spending on education as a share of GDP_education", "education-spending-gdp", regex=False)
    .str.replace("Gender-affirming care (historical)_gendercare", "gender-affirming-care", regex=False)
    .str.replace("Same-sex marriage (historical)_marriage", "same-sex-marriage", regex=False)
    .str.replace("Right to change legal gender (historical)_legalgender", "legal-gender", regex=False)
)
cleaned_long_df
year country country-code lgbtq-censorship employment-discrimination gini-index 990179-annotations_gini gdp-per-capita education-spending-gdp gender-affirming-care same-sex-marriage legal-gender
0 2001 Afghanistan AFG NaN NaN NaN NaN 1454.1108 NaN NaN Banned NaN
1 2001 Albania ALB NaN NaN NaN NaN 7215.8200 3.4587 NaN NaN NaN
2 2001 Algeria DZA Imprisonment as punishment NaN NaN NaN 11742.5950 NaN NaN Banned NaN
3 2001 Andorra AND NaN NaN NaN NaN 59109.0160 NaN Restricted NaN NaN
4 2001 Angola AGO NaN NaN NaN NaN 6049.1630 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
5265 2023 Vietnam VNM NaN NaN NaN NaN 13491.8800 NaN Legal Banned Legal, surgery required
5266 2023 World OWID_WRL NaN NaN NaN NaN 20670.9410 NaN NaN NaN NaN
5267 2023 Yemen YEM NaN NaN NaN NaN NaN NaN NaN Banned NaN
5268 2023 Zambia ZMB Imprisonment as punishment NaN NaN NaN 3673.4841 NaN NaN Banned NaN
5269 2023 Zimbabwe ZWE Imprisonment as punishment NaN NaN NaN 3442.2512 NaN NaN Banned NaN

5270 rows × 12 columns

print(cleaned_long_df.dtypes)
year                           int64
country                       object
country-code                  object
lgbtq-censorship              object
employment-discrimination     object
gini-index                   float64
990179-annotations_gini      float64
gdp-per-capita               float64
education-spending-gdp       float64
gender-affirming-care         object
same-sex-marriage             object
legal-gender                  object
dtype: object
float_cols = cleaned_long_df.select_dtypes(include=["float64"]).columns
cleaned_long_df[float_cols] = cleaned_long_df[float_cols].round(2)
eda_summary = pd.DataFrame({
    "Non-Null Count": cleaned_long_df.notnull().sum(),
    "Total Rows": len(cleaned_long_df),
    "Non-Null %": cleaned_long_df.notnull().mean() * 100,
    "Data Type": cleaned_long_df.dtypes
})


eda_summary = eda_summary.sort_values(by="Non-Null Count", ascending=False)
eda_summary["Non-Null %"] = eda_summary["Non-Null %"].round(1)
eda_summary.reset_index(inplace=True)
eda_summary.rename(columns={"index": "Column"}, inplace=True)


eda_summary
Column Non-Null Count Total Rows Non-Null % Data Type
0 year 5270 5270 100.0 int64
1 country 5270 5270 100.0 object
2 gdp-per-capita 4841 5270 91.9 float64
3 country-code 4772 5270 90.6 object
4 education-spending-gdp 3601 5270 68.3 float64
5 same-sex-marriage 2560 5270 48.6 object
6 legal-gender 1703 5270 32.3 object
7 gini-index 1664 5270 31.6 float64
8 gender-affirming-care 1414 5270 26.8 object
9 employment-discrimination 1225 5270 23.2 object
10 lgbtq-censorship 720 5270 13.7 object
11 990179-annotations_gini 0 5270 0.0 float64
## Again Dropping some columns based on Non-Null %
columns_to_drop = ['990179-annotations_gini']
cleaned_long_df = cleaned_long_df.drop(columns=columns_to_drop)
## One more time checking the columns 
eda_summary = pd.DataFrame({
    "Non-Null Count": cleaned_long_df.notnull().sum(),
    "Total Rows": len(cleaned_long_df),
    "Non-Null %": cleaned_long_df.notnull().mean() * 100,
    "Data Type": cleaned_long_df.dtypes
})


eda_summary = eda_summary.sort_values(by="Non-Null Count", ascending=False)
eda_summary["Non-Null %"] = eda_summary["Non-Null %"].round(1)
eda_summary.reset_index(inplace=True)
eda_summary.rename(columns={"index": "Column"}, inplace=True)


eda_summary
Column Non-Null Count Total Rows Non-Null % Data Type
0 year 5270 5270 100.0 int64
1 country 5270 5270 100.0 object
2 gdp-per-capita 4841 5270 91.9 float64
3 country-code 4772 5270 90.6 object
4 education-spending-gdp 3601 5270 68.3 float64
5 same-sex-marriage 2560 5270 48.6 object
6 legal-gender 1703 5270 32.3 object
7 gini-index 1664 5270 31.6 float64
8 gender-affirming-care 1414 5270 26.8 object
9 employment-discrimination 1225 5270 23.2 object
10 lgbtq-censorship 720 5270 13.7 object
print(cleaned_long_df.columns.tolist())
['year', 'country', 'country-code', 'lgbtq-censorship', 'employment-discrimination', 'gini-index', 'gdp-per-capita', 'education-spending-gdp', 'gender-affirming-care', 'same-sex-marriage', 'legal-gender']
#3 Checking for each country, how many missing values we have for each column 
summary = cleaned_long_df.groupby("country").agg(
    total_rows=("country", "size"),
    missing_gdp_per_capita=("gdp-per-capita", lambda x: x.isna().sum()),
    missing_same_sex_marriage=("same-sex-marriage", lambda x: x.isna().sum()),
    missing_legal_gender=("legal-gender", lambda x: x.isna().sum()),
    missing_lgbtq_censorship=("lgbtq-censorship", lambda x: x.isna().sum()),
    missing_employment_discrimination=("employment-discrimination", lambda x: x.isna().sum()),
    missing_gender_affirming_care=("gender-affirming-care", lambda x: x.isna().sum())
)
summary["missing_count"] = (
    summary["missing_gdp_per_capita"]
    + summary["missing_same_sex_marriage"]
    + summary["missing_same_sex_marriage"] 
    + summary["missing_legal_gender"]
    + summary["missing_lgbtq_censorship"]
    + summary["missing_employment_discrimination"]
    + summary["missing_gender_affirming_care"]
    + summary["missing_legal_gender"] 
)

summary_sorted = summary.sort_values("missing_count", ascending=False)
summary_sorted[summary_sorted["missing_count"] > 100]   ###
total_rows missing_gdp_per_capita missing_same_sex_marriage missing_legal_gender missing_lgbtq_censorship missing_employment_discrimination missing_gender_affirming_care missing_count
country
Western and Central Africa (WB) 22 22 22 22 22 22 22 176
Argentina (urban) 21 21 21 21 21 21 21 168
EU (27) 21 21 21 21 21 21 21 168
Central Europe and the Baltics (WB) 21 21 21 21 21 21 21 168
Bermuda 23 0 23 23 23 23 23 161
... ... ... ... ... ... ... ... ...
Sint Maarten (Dutch part) 15 0 15 15 15 15 15 105
Albania 23 0 2 23 23 9 23 105
Georgia 23 0 23 0 23 13 23 105
Malta 23 0 13 14 23 3 23 103
Cameroon 23 0 0 23 9 23 23 101

140 rows × 8 columns

## Based on the top table, some countries have missing values for almost all columns, so we are going to delete them instead of deleting the columns
countries_to_exclude = summary_sorted[summary_sorted["missing_count"] > 100].index.tolist()
cleaned_long_df = cleaned_long_df[~cleaned_long_df["country"].isin(countries_to_exclude)]
cleaned_long_df = cleaned_long_df.reset_index(drop=True)
cleaned_long_df.head()
year country country-code lgbtq-censorship employment-discrimination gini-index gdp-per-capita education-spending-gdp gender-affirming-care same-sex-marriage legal-gender
0 2001 Algeria DZA Imprisonment as punishment NaN NaN 11742.60 NaN NaN Banned NaN
1 2001 Andorra AND NaN NaN NaN 59109.02 NaN Restricted NaN NaN
2 2001 Argentina ARG NaN NaN NaN 21066.46 4.83 NaN NaN NaN
3 2001 Armenia ARM NaN NaN 0.35 5044.12 2.47 Legal, but restricted for minors NaN Legal, surgery required
4 2001 Australia AUS Varies by region NaN 0.33 44667.93 5.22 Legal NaN Varies by region
 ## One more time checking the columns 
eda_summary = pd.DataFrame({
    "Non-Null Count": cleaned_long_df.notnull().sum(),
    "Total Rows": len(cleaned_long_df),
    "Non-Null %": cleaned_long_df.notnull().mean() * 100,
    "Data Type": cleaned_long_df.dtypes
})


eda_summary = eda_summary.sort_values(by="Non-Null Count", ascending=False)
eda_summary["Non-Null %"] = eda_summary["Non-Null %"].round(1)
eda_summary.reset_index(inplace=True)
eda_summary.rename(columns={"index": "Column"}, inplace=True)


eda_summary
Column Non-Null Count Total Rows Non-Null % Data Type
0 year 2125 2125 100.0 int64
1 country 2125 2125 100.0 object
2 country-code 2094 2125 98.5 object
3 gdp-per-capita 2023 2125 95.2 float64
4 same-sex-marriage 1700 2125 80.0 object
5 education-spending-gdp 1677 2125 78.9 float64
6 legal-gender 1419 2125 66.8 object
7 gini-index 1173 2125 55.2 float64
8 gender-affirming-care 1152 2125 54.2 object
9 employment-discrimination 988 2125 46.5 object
10 lgbtq-censorship 521 2125 24.5 object
# Desired column order
column_order = [
    "year", "country", "country-code",
    "gdp-per-capita", "education-spending-gdp",
    "same-sex-marriage", "lgbtq-censorship",
    "employment-discrimination", "gender-affirming-care",
    "legal-gender"
]

cleaned_long_df = cleaned_long_df[column_order]

cleaned_long_df.to_csv("../data/clean/globalrights.csv", index=False)
 
 

This page is built with Quarto.