import pandas as pd
Clean up for HCMST_2017-2022.csv
Loading CSV
Load the necessary libraries for the clean-up, as well as the provided dataset
= pd.read_csv("../data/raw/how-couples-meet-and-stay-together/HCMST_2017-2022.csv")
df df.head()
caseid_new | w3_Weight | w3_Weight_LGB | w3_combo_weight | w3_attrition_adj_weight | w2_weight_genpop | w2_weight_LGB | w2_combo_weight | w2_attrition_adj_weights | w1_weight_combo | w1_weight_combo_freqwt | w3_xpartner_type | w3_xlast_contact_year | w3_xcohab | w3_xsamesex | w3_xlast_contact_mo | w3_xnamep_present | w3_duration | w3_gen_pop_sample | w3_ppage | w3_ppagecat | w3_ppagect4 | w3_ppeduc | w3_ppeducat | w3_ppethm | w3_ppgender | w3_pphhhead | w3_pphhsize | w3_pphouse | w3_ppincimp | w3_real_inc | w3_log_real_inc | w3_ppmarit | w3_ppmsacat | w3_PPREG4 | w3_ppreg9 | w3_pprent | w3_PPT01 | w3_PPT25 | w3_PPT612 | w3_total_kids_lt13 | w3_PPT1317 | w3_PPT18OV | w3_ppwork | w3_surveyed | w3_section | w3_partner_type | w3_married | w3_otherdate | w3_coronavirus_effect_combo | w3_corona_effect_combo_reversed | w3_live_w_partner | w3_same_sex_couple | w3_marriage_dp | w3_mar_dp_why_words | w3_mar_dp_why_len | w3_mar_dp_why_sum | w3_rel_qual | w3_Q23 | w3_sex_frequency | w3_weekly_sex_frequency | w3_how_many | w3_how_many_corona | w3_p_monogamy | w3_monogamy | w3_past_partners_gender_1 | w3_past_partners_gender_2 | w3_past_partners_gender_3 | w3_move_in_yr | w3_move_in_mo | w3_when_married_yr | w3_when_married_month | w3_why_not_cohab | w3_how_often | w3_how_communicate_1 | w3_how_communicate_2 | w3_how_communicate_3 | w3_how_communicate_4 | w3_how_communicate_5 | w3_flirt | w3_flirt_other | w3_fight | w3_shelter | w3_shelter_4 | w3_nonshelter | w3_shelter_combo_months | w3_WFH | w3_more_WFH | w3_WFH_future | w3_myhealth | w3_myhealth_why_1 | w3_myhealth_why_2 | w3_subject_had_COVID | w3_corona_vaccine | w3_partner_corona_vaccine | w3_vaccine_same | w3_corona_test | w3_parthealth | w3_parthealth_why_1 | w3_parthealth_why_2 | w3_partner_had_COVID | w3_partcorona_test | w3_pandemic_income | w3_pandemic_income_collapsed | w3_covid_masking | w3_COVID_agreement | w3_covid_complete_agree | w3_government_response | w3_relatives | w3_relationship_end_combo | w3_who_breakup | w3_women_wanted_breakup | w3_partner_type_for_het_breakups | w3_breakup_year | w3_breakup_month | w3_partner_passaway_year | w3_partner_passaway_month | w3_new_relationship | w3_coronavirus_effect_dating | w3_corona_longing | w3_app_use | w3_Q4 | w3_Q5 | w3_Q6A | w3_Q6B | w3_Q9 | w3_Q10 | w3_Q11 | w3_Q12 | w3_Q20 | w3_how_far_apart | w3_Q21A_year | w3_Q21A_month | w3_Q21B_year | w3_Q21B_month | w3_Q21C_year | w3_Q21C_month | w3_Q21D_year | w3_Q21D_month | w3_Q21E_year | w3_Q21E_month | w3_q24_length | w3_q24_R_friend | w3_q24_P_friend | w3_q24_I_friend | w3_q24_R_cowork | w3_q24_R_family | w3_q24_R_sig_other | w3_q24_R_neighbor | w3_q24_P_cowork | w3_q24_P_family | w3_q24_P_sig_other | w3_q24_P_neighbor | w3_q24_I_cowork | w3_q24_I_family | w3_q24_I_sig_other | w3_q24_I_neighbor | w3_q24_school | w3_q24_college | w3_q24_mil | w3_q24_church | w3_q24_vol_org | w3_q24_customer | w3_q24_bar_restaurant | w3_q24_party | w3_q24_internet_other | w3_q24_internet_dating | w3_q24_internet_soc_network | w3_q24_internet_game | w3_q24_internet_chat | w3_q24_internet_org | w3_q24_public | w3_q24_blind_date | w3_q24_vacation | w3_q24_singles_serve_nonint | w3_q24_business_trip | w3_q24_work_neighbors | w3_q24_summary_all_codes | w3_q24_met_online | w3_q24_met_through_friend | w3_q24_met_through_family | w3_q24_met_through_as_nghbrs | w3_q24_met_as_through_cowork | w3_Q25 | w3_Q26 | w3_Q27 | w3_Q28 | w3_Q32 | w3_friend_connect_1 | w3_friend_connect_2 | w3_friend_connect_3 | w3_friend_connect_4 | w3_partner_source | w3_month_rel_started | w3_month_interview | w3_relationship_duration_mos | w3_relationship_duration_yrs | w3_breakup_source | w2_month_of_survey | w2_xpartner_type | xpartner_type_cohab | w2_xcohab | w2_xpppa_lgb_num | w2_lgb | w2_duration | w2_ppage | w2_ppagecat | w2_ppagect4 | w2_ppeduc | w2_ppeducat | w2_BA | w2_ppethm | w2_ppgender | w2_pphhhead | w2_pphhsize | w2_pphouse | w2_ppincimp | w2_income_real | w2_log_real_inc | w2_ppmsacat | w2_PPREG4 | w2_ppreg9 | w2_pprent | w2_PPT01 | w2_PPT25 | w2_PPT612 | w2_kids_lt13 | w2_total_kids_lt13 | w2_PPT1317 | w2_PPT18OV | w2_ppwork | w2_surveyed | w2_coronavirus_effect_combo | w2_corona_effect_combo_reversed | w2_pandemic_income_combo | w2_app_use_combo | w2_rel_qual_combo | w2_rel_qual_reduced | w2_partner_type | w2_married | w2_partner_type_cohab | w2_live_w_partner | w2_Q20 | w2_same_sex_couple | w2_lesbian | w2_cohab | w2_section | w2_Q21B_yr | w2_q24_length | w2_q24_R_friend | w2_q24_P_friend | w2_q24_I_friend | w2_q24_R_cowork | w2_q24_R_family | w2_q24_R_sig_other | w2_q24_R_neighbor | w2_q24_P_cowork | w2_q24_P_family | w2_q24_P_sig_other | w2_q24_P_neighbor | w2_q24_I_cowork | w2_q24_I_family | w2_q24_I_sig_other | w2_q24_I_neighbor | w2_q24_school | w2_q24_college | w2_q24_mil | w2_q24_church | w2_q24_vol_org | w2_q24_customer | w2_q24_bar_restaurant | w2_q24_party | w2_q24_internet_other | w2_q24_internet_dating | w2_q24_internet_soc_network | w2_q24_internet_game | w2_q24_internet_chat | w2_q24_internet_org | w2_q24_public | w2_q24_blind_date | w2_q24_vacation | w2_q24_singles_serve_nonint | w2_q24_business_trip | w2_q24_work_neighbors | w2_q24_summary_all_codes_test | w2_q24_met_through_friend | w2_q24_met_through_family | w2_q24_met_through_as_nghbrs | w2_q24_met_as_through_cowork | w2_q24_met_online | w2_q24_answered | w2_q24_coworkers | w2_who_breakup_combo | yr_breakup_combo | w2_otherdate_combo | w2_how_many_combo | w2_how_many_corona_combo | w2_number_people_met | w2_corona_effect_dating_combo | w2_shelter_combo_months | w2_corona_longing_combo | w2_breakup_year | w2_breakup_month | w2_covid_breakup | w2_breakup_month_tm | w2_yr_relationship_started | w2_relationship_start_calmonth | w2_new_relationship | w2_q21a_year | w2_q21a_month | w2_q21b_year | w2_q21b_month | w2_q21c_year | w2_q21c_month | w2_q21d_year | w2_q21d_month | w2_q21e_year | w2_q21e_month | w2_met_month | w2_sex_frequency | w2_weekly_sex_frequency | w2_relationship_duration | w2_relatives | w2_women_wanted_breakup | w2_partner_type_for_het_breakups | w2_Q23_old | w2_Q23_new | w2_p_monogamy | w2_Q21A_1 | w2_Q21A_2 | w2_Q21B_1 | w2_Q21B_2 | w2_move_in_yr | w2_move_in_mo | w2_move_in_2020 | w2_separation_timing_1_1 | w2_separation_timing_1_2 | w2_separation_timing_2020 | w2_how_far_apart | w2_why_not_cohab | w2_how_often | w2_how_communicate_1 | w2_how_communicate_2 | w2_how_communicate_3 | w2_how_communicate_4 | w2_how_communicate_5 | w2_flirt | w2_flirt_other | w2_fight | w2_nonshelter | w2_myhealth | w2_myhealth_why_1 | w2_myhealth_why_2 | w2_corona_test | w2_parthealth | w2_parthealth_why_1 | w2_parthealth_why_2 | w2_partcorona_test | w2_partcorona_test_new | w2_roommates | w2_roommates_who_1 | w2_roommates_who_2 | w2_roommates_who_3 | w2_roommates_who_4 | w2_exroommates | w2_relationship_end | w2_relationship_end_yr | w2_relationship_end_mo | w2_partner_passaway_yr | w2_partner_passaway_month | w2_Q4 | w2_Q5 | w2_Q6A | w2_Q6B | w2_Q9 | w2_Q10 | w2_Q11 | w2_Q12 | w2_live_together_2020 | w2_relationship_progress_2020 | w2_Q25 | w2_Q26 | w2_Q27 | w2_Q28 | w2_Q32 | w2_friend_connect_1 | w2_friend_connect_2 | w2_friend_connect_3 | w2_friend_connect_4 | w1_duration | w1_speed_flag | w1_consent | w1_partyid7 | w1_PERSNET_hom | w1_ppc10017 | w1_ppc21310 | w1_ppp20071 | w1_ppp20072 | w1_ppage | w1_ppagecat | w1_ppagect4 | w1_ppeduc | w1_ppeducat | w1_ppethm | w1_ppRace_1 | w1_ppRace_2 | w1_ppRace_3 | w1_ppRace_4 | w1_ppRace_5 | w1_ppRace_6 | w1_pprace1 | w1_pprace2 | w1_pprace3 | w1_pprace4 | w1_pprace5 | w1_pprace6 | w1_pprace7 | w1_pprace8 | w1_pprace9 | w1_pprace10 | w1_pprace11 | w1_pprace12 | w1_pprace13 | w1_pprace14 | w1_pprace15 | w1_ppgender | w1_pphhhead | w1_pphhsize | w1_pphouse | w1_ppincimp | w1_ppincimp_cat | w1_ppmarit | w1_ppmsacat | w1_PPREG4 | w1_ppreg9 | w1_pprent | w1_PPT01 | w1_PPT25 | w1_PPT612 | w1_PPT1317 | w1_PPT18OV | w1_ppwork | w1_xlgb | w1_section | w1_partnership_status | w1_partnership_status_cohab | w1_female | w1_sex_frequency | w1_otherdate | w1_how_many | w1_how_meet_Refused | w1_otherdate_app | w1_how_many_app | w1_took_the_survey | w1_prior_identity_lgb | w1_same_sex_couple | w1_same_sex_couple_gender | w1_q4 | w1_q5 | w1_q6a | w1_q6b | w1_q9 | w1_q10 | w1_q11 | w1_q12 | w1_q14 | w1_q15a1 | w1_q15a2 | w1_q15a4 | w1_q15a5 | w1_q15a7 | w1_q16 | w1_q17 | w1_attraction | w1_q19 | w1_q20 | w1_q21a_year | w1_q21a_month | w1_q21a_month_flag | w1_q21b_year | w1_q21b_month | w1_q21b_month_flag | w1_q21c_year | w1_q21c_month | w1_q21c_month_flag | w1_q21d_year | w1_q21d_month | w1_q21e_year | w1_q21e_month | w1_q21f_year | w1_q21f_month | w1_identity_all | w1_identity_all_modified | w1_outness_all | w1_outness_timing_all | w1_q23 | w1_q24_length | w1_q24_R_cowork | w1_q24_R_friend | w1_q24_R_family | w1_q24_R_sig_other | w1_q24_R_neighbor | w1_q24_P_cowork | w1_q24_P_friend | w1_q24_P_family | w1_q24_P_sig_other | w1_q24_P_neighbor | w1_q24_I_cowork | w1_q24_I_friend | w1_q24_I_family | w1_q24_I_sig_other | w1_q24_I_neighbor | w1_q24_school | w1_q24_college | w1_q24_mil | w1_q24_church | w1_q24_vol_org | w1_q24_customer | w1_q24_bar_restaurant | w1_q24_party | w1_q24_internet_other | w1_q24_internet_dating | w1_q24_internet_soc_network | w1_q24_internet_game | w1_q24_internet_chat | w1_q24_internet_org | w1_q24_public | w1_q24_blind_date | w1_q24_vacation | w1_q24_singles_serve_nonint | w1_q24_business_trip | w1_q24_work_neighbors | w1_q24_met_online | w1_q24_summary_all_codes | w1_q24_summary_all_codes_test | w1_q24_met_through_family | w1_q24_met_through_friend | w1_q24_met_through_as_nghbrs | w1_q24_met_as_through_cowork | w1_q25 | w1_q26 | w1_q27 | w1_q28 | w1_friend_connect_1_all | w1_friend_connect_2_all | w1_friend_connect_3_all | w1_friend_connect_4_all | w1_q32 | w1_q34 | w1_q34_reduced | w1_otherdate_all | w1_how_many_all | w1_otherdate_app_all | w1_how_many_app_all | w1_q40 | w1_number_people_met | w1_otherdate_dichotomous | w1_married | w1_relate_duration_in2017_years | w1_number_people_met_app | w1_q40_phone_app_incl_other | w1_year_fraction_met | w1_year_fraction_relstart | w1_age_when_met | w1_time_from_met_to_rel | w1_year_fraction_first_cohab | w1_time_from_rel_to_cohab | w1_subject_race | w1_interracial_5cat | w1_partner_mother_yrsed | w1_subject_mother_yrsed | w1_partner_yrsed | w1_subject_yrsed | w1_either_internet_adjusted | w1_how_met_online | w1_met_after_2009 | w1_q24_metonline_no_phone_apps | w1_q32_met_online_phone_apps | w1_max_relation_status | w1_smart_phone | w1_days_access_internet_perwk | w1_year_met_from_1990 | w1_ever_marriedq21 | change_in_rel_quality_w1w2 | w1_relationship_start_calmonth | w1_met_month | w1_weekly_sex_frequency | w1w2_sex_freq_diff | p17_core_emp | p17_ppcmdate2017 | p17_pppa_lgb | p17_pppa1634 | p17_pppa1635 | p17_pppa1648 | p17_ppp20072 | p17_ppp20071 | p17_ppp10206 | p17_ppp10207 | p17_ppp10208 | p17_ppp10209 | p17_ppp10210 | p17_ppp10211 | p17_ppp10212 | p17_ppp10213 | p17_pppa1704 | p17_pppa1705 | p17_pppadate2017 | p17_ppfs0596 | p17_ppfs1480 | p17_ppfs1482 | p17_ppfsdate2017 | p18_core_emp | p18_ppmarit | p18_ppincimp | p18_ppcmdate2018 | p18_pppa_lgb | p18_pppa1634 | p18_pppa1635 | p18_pppa1648 | p18_ppp20072 | p18_ppp20071 | p18_ppp10206 | p18_ppp10207 | p18_ppp10208 | p18_ppp10209 | p18_ppp10210 | p18_ppp10211 | p18_ppp10212 | p18_ppp10213 | p18_pppa1704 | p18_pppa1705 | p18_pppa1802 | p18_pppa1803 | p18_pppadate2018 | p18_ppfs0596 | p18_ppfs1480 | p18_ppfs1482 | p18_ppfsdate2018 | p19_core_emp | p19_ppmarit | p19_ppincimp | p19_ppcmdate2019 | p19_pppa_lgb | p19_pppa1634 | p19_pppa1902 | p19_pppa1903 | p19_pppa1904 | p19_pppa1905 | p19_pppa1648 | p19_ppp20072 | p19_ppp20071 | p19_ppp10206 | p19_ppp10207 | p19_ppp10208 | p19_ppp10209 | p19_ppp10210 | p19_ppp10211 | p19_ppp10212 | p19_ppp10213 | p19_pppa1704 | p19_pppa1705 | p19_pppa1802 | p19_pppa1803 | p19_pppadate2019 | p19_ppfs0596 | p19_ppfs1480 | p19_ppfs1482 | p19_ppfsdate2019 | p19_ppsi1927 | p19_ppsi1928 | p19_ppsi1929 | p19_ppsi1930 | p19_ppsi1931 | p19_ppsi1932 | p19_ppsi1933 | p19_ppsi1934 | p19_ppsi1935 | p19_ppsi1936 | p19_ppsidate2019 | p20_core_emp | p20_ppmarit | p20_ppcmdate2020 | p20_ppfs0596 | p20_ppfs1480 | p20_ppfs1482 | p20_ppfsdate2020 | p20_ppsi1927 | p20_ppsi1928 | p20_ppsi1929 | p20_ppsi1930 | p20_ppsi1931 | p20_ppsi1932 | p20_ppsi1933 | p20_ppsi1934 | p20_ppsi1935 | p20_ppsi1936 | p20_ppsidate2020 | p20_ppp10206 | p20_ppp10207 | p20_ppp10208 | p20_ppp10209 | p20_ppp10210 | p20_ppp10211 | p20_ppp10212 | p20_pppa1704 | p20_pppa1705 | p20_pppa1802 | p20_pppa1803 | p20_ppp12002 | p20_ppp12003 | p20_ppp12004 | p20_ppp12005 | p20_ppp10213 | p20_ppp1date2020 | p20_pppagnid | p20_pppa1634 | p20_pppa1902 | p20_pppa1903 | p20_pppa1904 | p20_ppp22001 | p20_pppa1905 | p20_pppa1648 | p20_ppp20072 | p20_ppp20071 | p20_ppp2date2020 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 0.4422 | NaN | 0.495308 | 0.400185 | 0.3856 | NaN | 0.437670 | 0.380351 | 0.426861 | 29628 | 2 | 2020 | 2.0 | 2.0 | 9 | 1 | 5.88 | 1.0 | 53.0 | 4.0 | 3.0 | 9.0 | 2.0 | 1.0 | 2.0 | 0.0 | 2.0 | 1.0 | 10.0 | 37500.0 | 10.473937 | 3.0 | 0.0 | 2.0 | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1 | 3.0 | 2 | 0.0 | NaN | 1.0 | 3.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | 1.0 | 3.0 | 3.0 | 1.50 | NaN | NaN | 1.0 | 1.0 | NaN | 19.0 | NaN | 2021.0 | 11.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 6.0 | 1.0 | 3.0 | NaN | NaN | 0.75 | 4.0 | 0.0 | 2.0 | 2.0 | NaN | NaN | 0.0 | 4.0 | 4.0 | 1.0 | 2.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 3.0 | 3.0 | 2.0 | 1.0 | 1.0 | 3.0 | 3.0 | 0.0 | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 728.0 | 746.0 | 18.0 | 1.500000 | NaN | 728.0 | 1.0 | 1.0 | 1.0 | 2.0 | 0.0 | 13.550000 | 51.0 | 4.0 | 3.0 | 9.0 | 2.0 | 0.0 | 1.0 | 2.0 | 1.0 | 2.0 | 1.0 | 10.0 | 37500.0 | 10.532096 | 0.0 | 2.0 | 3.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1 | 2.0 | 2.0 | 3.0 | NaN | 2.0 | 2.0 | 2 | 0.0 | 3.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2020.0 | 247.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1 | 0.0 | 1.0 | 2020.0 | 2.0 | 3.0 | NaN | 13.0 | NaN | 0.25 | NaN | 2020.0 | 4.0 | 0.0 | 723.0 | 2020.0 | 728.0 | 3.0 | 2020.0 | 9.0 | 2020.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | 728.0 | 5.0 | 0.250 | 0.0 | 4.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 2020.0 | 9.0 | 2020.0 | 9.0 | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | 2.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 6.0 | 1.0 | 1.0 | NaN | 2.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | 3.0 | 1.0 | 2020.0 | 4.0 | NaN | NaN | 1.0 | NaN | 1.0 | 1.0 | 51.0 | 13.0 | 9.0 | 3.0 | NaN | 1.0 | 2.0 | NaN | 2.0 | 2.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 4158 | 0 | 1 | 6.0 | 1 | 1 | 1 | -2.0 | 6.0 | 48 | 4 | 3 | 9 | 2 | 5 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 1 | 1 | 13 | 2 | 1 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 3.0 | 1.0 | NaN | NaN | NaN | NaN | 1 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 1.0 | 1.0 | 46.0 | 11.0 | 14.0 | 1.0 | 9.0 | 1.0 | 14.0 | 1.0 | 14.0 | 1.0 | 7.0 | 3.0 | 1.0 | 1.0 | NaN | 2013.0 | 6.0 | 0.0 | 2013.0 | 12.0 | 0.0 | 2013.0 | 12.0 | NaN | 2014.0 | 9.0 | NaN | NaN | NaN | NaN | 1.0 | 1 | NaN | NaN | 3.0 | 130.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | 3.583333 | NaN | NaN | 2013.458374 | 2013.958374 | 44.0 | 0.500000 | 2013.958374 | 0.000000 | 3.0 | 1.0 | 20.0 | 12.0 | 14.0 | 12.0 | 0.0 | NaN | 1.0 | 0.0 | 0.0 | 2.0 | 1.0 | 30.0 | 23.0 | 1 | 1.0 | 647.0 | 641.0 | 1.500 | -1.25 | 0.0 | 20170807.0 | 2.0 | 2.0 | 2.0 | 13.0 | 6.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 20171025.0 | 1.0 | 3.0 | 5.0 | 20171227.0 | 0.0 | 1.0 | 13.0 | 20180820.0 | 2.0 | 2.0 | 2.0 | 13.0 | 6.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20190621.0 | 2.0 | 3.0 | 5.0 | 20190317.0 | 0.0 | 1.0 | 10.0 | 20191101.0 | 2.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 11.0 | 6.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200603.0 | 2.0 | 4.0 | 5.0 | 20200306.0 | 2.0 | 4.0 | 3.0 | 2.0 | 3.0 | 3.0 | 4.0 | 4.0 | 4.0 | 4.0 | 20190716.0 | 0.0 | 3.0 | 20201215.0 | 2.0 | 3.0 | 5.0 | 20210222.0 | 3.0 | 4.0 | 2.0 | 3.0 | 2.0 | 4.0 | 4.0 | 4.0 | 5.0 | 5.0 | 20201016.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200913.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 13.0 | 6.0 | NaN | 20210506.0 |
1 | 71609 | 0.8284 | NaN | 0.927891 | 0.879258 | 0.9196 | NaN | 1.043778 | 0.953948 | 1.295508 | 89921 | 1 | 2020 | 1.0 | 2.0 | 9 | 1 | 10.47 | 1.0 | 72.0 | 6.0 | 4.0 | 10.0 | 3.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 14.0 | 80000.0 | 11.231623 | 1.0 | 0.0 | 4.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1 | 1.0 | 1 | 1.0 | NaN | 2.0 | 2.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | 2.0 | 1.0 | 5.0 | 0.25 | NaN | NaN | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 6.0 | 8.0 | 1.0 | NaN | 1.0 | 0.00 | 4.0 | 0.0 | 4.0 | 2.0 | NaN | NaN | 0.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | NaN | NaN | 0.0 | 2.0 | 2.0 | 2.0 | 2.0 | 2.0 | 0.0 | 3.0 | 11.0 | 0.0 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 57.0 | 746.0 | 689.0 | 57.416668 | NaN | 728.0 | 1.0 | 1.0 | 1.0 | 2.0 | 0.0 | 31.116667 | 71.0 | 6.0 | 4.0 | 10.0 | 3.0 | 0.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 | 13.0 | 67500.0 | 11.119883 | 0.0 | 4.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1 | 2.0 | 2.0 | 3.0 | NaN | 1.0 | 1.0 | 1 | 1.0 | 1.0 | 1.0 | NaN | 0.0 | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.00 | NaN | NaN | NaN | NaN | NaN | 1964.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.0 | 0.250 | 56.0 | 6.0 | NaN | 1.0 | 3.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 6.0 | 4.0 | 1.0 | 2.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 38 | 0 | 1 | 3.0 | 1 | 2 | 1 | 1.0 | 4.0 | 68 | 6 | 4 | 10 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 2 | 1 | 12 | 2 | 1 | 0 | 4 | 9 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 5.0 | 1.0 | NaN | NaN | NaN | NaN | 1 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 1.0 | 1.0 | 71.0 | 10.0 | 9.0 | 3.0 | 5.0 | 1.0 | 37.0 | 1.0 | 5.0 | 1.0 | 7.0 | 1.0 | 1.0 | 1.0 | NaN | 1964.0 | 8.0 | 0.0 | 1964.0 | 10.0 | 0.0 | 1969.0 | 3.0 | NaN | 1969.0 | 3.0 | NaN | NaN | NaN | NaN | 1.0 | 1 | NaN | NaN | 3.0 | 400.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 4.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | 52.750000 | NaN | NaN | 1964.625000 | 1964.791626 | 15.0 | 0.166626 | 1969.208374 | 4.416748 | 1.0 | 0.0 | 12.0 | 9.0 | 13.0 | 13.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 30.0 | -26.0 | 1 | 0.0 | 57.0 | 55.0 | 0.250 | 0.00 | 1.0 | 20170702.0 | 2.0 | 2.0 | 2.0 | 1.0 | 4.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 20180122.0 | 7.0 | 4.0 | 5.0 | 20180216.0 | 0.0 | 1.0 | 11.0 | 20180910.0 | 2.0 | 2.0 | 2.0 | 1.0 | 4.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 20190311.0 | 6.0 | 4.0 | 4.0 | 20190318.0 | 0.0 | 1.0 | 13.0 | 20191209.0 | 2.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 4.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20190912.0 | 1.0 | 4.0 | 5.0 | 20200515.0 | 4.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 4.0 | 4.0 | 2.0 | 5.0 | 20190728.0 | 1.0 | 1.0 | 20210207.0 | NaN | NaN | NaN | NaN | 3.0 | 5.0 | 1.0 | 3.0 | 3.0 | 2.0 | 3.0 | 5.0 | 1.0 | 5.0 | 20201118.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 20200917.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 1.0 | 20201118.0 |
2 | 106983 | 0.8255 | NaN | 0.924643 | 0.706467 | 0.7748 | NaN | 0.879425 | 0.724682 | 1.126573 | 78196 | 1 | 2020 | 1.0 | 2.0 | 9 | 1 | 4.43 | 1.0 | 43.0 | 3.0 | 2.0 | 11.0 | 3.0 | 1.0 | 1.0 | 0.0 | 5.0 | 1.0 | 14.0 | 80000.0 | 11.231623 | 1.0 | 1.0 | 4.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 1.0 | 1 | 1.0 | 1 | 1.0 | NaN | 2.0 | 2.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | 1.0 | 1.0 | 3.0 | 1.50 | NaN | NaN | 1.0 | 1.0 | 10.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 6.0 | 3.0 | 1.0 | NaN | 1.0 | 0.00 | 4.0 | 0.0 | 4.0 | 2.0 | NaN | NaN | 0.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | NaN | NaN | 0.0 | 2.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 1.0 | 6.0 | 0.0 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 479.0 | 747.0 | 268.0 | 22.333334 | NaN | 728.0 | 1.0 | 1.0 | 1.0 | 2.0 | 0.0 | 485.633333 | 42.0 | 3.0 | 2.0 | 11.0 | 3.0 | 0.0 | 1.0 | 1.0 | 0.0 | 5.0 | 1.0 | 15.0 | 92500.0 | 11.434964 | 1.0 | 4.0 | 9.0 | 1.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 1 | 2.0 | 2.0 | 2.0 | NaN | 1.0 | 1.0 | 1 | 1.0 | 1.0 | 1.0 | NaN | 0.0 | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.00 | NaN | NaN | NaN | NaN | NaN | 1999.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | 1.500 | 21.0 | 5.0 | NaN | 1.0 | 1.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 6.0 | 3.0 | 1.0 | 2.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 72 | 0 | 1 | 7.0 | 1 | 1 | 1 | 2.0 | 4.0 | 39 | 3 | 2 | 11 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 5 | 1 | 15 | 3 | 1 | 1 | 4 | 9 | 1 | 0 | 0 | 2 | 0 | 3 | 1 | 2 | 1 | 1 | 1 | 0 | 3.0 | 1.0 | NaN | NaN | NaN | NaN | 1 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 1.0 | 1.0 | 49.0 | 10.0 | 13.0 | 7.0 | 12.0 | 1.0 | 12.0 | 1.0 | 12.0 | 1.0 | 5.0 | 1.0 | 1.0 | 1.0 | NaN | 1999.0 | 3.0 | 0.0 | 1999.0 | 12.0 | 0.0 | 2000.0 | 4.0 | NaN | 2002.0 | 9.0 | NaN | NaN | NaN | NaN | 1.0 | 1 | NaN | NaN | 1.0 | 210.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | 17.583334 | NaN | NaN | 1999.208374 | 1999.958374 | 21.0 | 0.750000 | 2000.291626 | 0.333252 | 1.0 | 0.0 | 17.0 | 16.0 | 13.0 | 14.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 30.0 | 9.0 | 1 | 0.0 | 479.0 | 470.0 | 1.500 | 0.00 | 0.0 | 20170624.0 | 2.0 | 1.0 | 1.0 | 1.0 | 4.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 20170907.0 | 1.0 | 2.0 | 4.0 | 20171216.0 | 0.0 | 1.0 | 15.0 | 20180822.0 | 2.0 | 1.0 | 1.0 | 1.0 | 4.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20190612.0 | 3.0 | 3.0 | 4.0 | 20190420.0 | 0.0 | 1.0 | 15.0 | 20191104.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | 2.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200517.0 | 1.0 | 2.0 | 4.0 | 20200421.0 | 4.0 | 4.0 | 3.0 | 4.0 | 2.0 | 4.0 | 3.0 | 5.0 | 4.0 | 3.0 | 20190713.0 | 0.0 | 1.0 | 20201222.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200910.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 | 2.0 | 20210429.0 |
3 | 121759 | NaN | NaN | NaN | NaN | 0.9177 | NaN | 1.041622 | 0.793093 | 0.933440 | 64790 | 1 | 2020 | 1.0 | 2.0 | 9 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 728.0 | 1.0 | 1.0 | 1.0 | 2.0 | 0.0 | 22.600000 | 57.0 | 5.0 | 3.0 | 9.0 | 2.0 | 0.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1.0 | 16.0 | 112500.0 | 11.630709 | 1.0 | 1.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1 | 2.0 | 2.0 | 3.0 | NaN | 2.0 | 2.0 | 1 | 1.0 | 1.0 | 1.0 | NaN | 0.0 | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.00 | NaN | NaN | NaN | NaN | NaN | 1990.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | 0.625 | 31.0 | 6.0 | NaN | 1.0 | 3.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | 6.0 | 1.0 | 1.0 | 2.0 | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 25 | 0 | 1 | 2.0 | 1 | 1 | 1 | 1.0 | 2.0 | 54 | 4 | 3 | 9 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 4 | 1 | 16 | 3 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 4 | 1 | 2 | 1 | 1 | 1 | 0 | 4.0 | 1.0 | NaN | NaN | NaN | NaN | 1 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 1.0 | 4.0 | 59.0 | 13.0 | 1.0 | 4.0 | 14.0 | 1.0 | 33.0 | 1.0 | 33.0 | 1.0 | 3.0 | 1.0 | 1.0 | 1.0 | NaN | 1989.0 | 10.0 | 0.0 | 1990.0 | 2.0 | 0.0 | 1991.0 | 4.0 | NaN | 1991.0 | 3.0 | NaN | NaN | NaN | NaN | 1.0 | 1 | NaN | NaN | 3.0 | 423.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | NaN | 2.0 | 2.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | 27.416666 | NaN | NaN | 1989.791626 | 1990.125000 | 26.0 | 0.333374 | 1991.291626 | 1.166626 | 1.0 | 1.0 | 0.0 | 20.0 | 17.0 | 12.0 | 0.0 | NaN | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 30.0 | -1.0 | 1 | 1.0 | 361.0 | 357.0 | 0.625 | 0.00 | 0.0 | 20170814.0 | 2.0 | 1.0 | 1.0 | 11.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 20170914.0 | 1.0 | 4.0 | 5.0 | 20171212.0 | 1.0 | 1.0 | 18.0 | 20180819.0 | 2.0 | 1.0 | 1.0 | 11.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20190613.0 | 3.0 | 4.0 | 5.0 | 20190330.0 | 1.0 | 1.0 | 16.0 | 20191103.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 11.0 | 2.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200517.0 | 5.0 | 4.0 | 5.0 | 20200308.0 | 5.0 | 2.0 | 2.0 | 5.0 | 4.0 | 4.0 | 4.0 | 5.0 | 2.0 | 5.0 | 20190721.0 | 1.0 | 1.0 | 20210114.0 | 2.0 | 4.0 | 5.0 | 20210224.0 | 4.0 | 3.0 | 1.0 | 4.0 | 4.0 | 4.0 | 2.0 | 4.0 | 1.0 | 5.0 | 20201017.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200917.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 2.0 | 1.0 | 20210507.0 |
4 | 158083 | 0.8810 | NaN | 0.986809 | 0.655467 | 0.8697 | NaN | 0.987140 | 0.735473 | 0.931291 | 64641 | 3 | 2020 | NaN | NaN | 9 | 0 | 5.34 | 1.0 | 53.0 | 4.0 | 3.0 | 10.0 | 3.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 18.0 | 162500.0 | 11.940274 | 3.0 | 1.0 | 4.0 | 8.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 1.0 | 1 | 6.0 | 3 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | 5.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | NaN | 4.0 | 2.0 | NaN | 2.00 | 1.0 | 1.0 | 1.0 | 2.0 | NaN | NaN | 0.0 | 1.0 | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | 3.0 | 3.0 | 1.0 | NaN | NaN | 1.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | 2.0 | 2.0 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 746.0 | NaN | NaN | NaN | 728.0 | 3.0 | 4.0 | NaN | 2.0 | 0.0 | 3.083333 | 52.0 | 4.0 | 3.0 | 10.0 | 3.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.0 | 1.0 | 18.0 | 162500.0 | 11.998433 | 1.0 | 4.0 | 8.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | 1.0 | 1 | NaN | NaN | 3.0 | 2.0 | NaN | NaN | 3 | 0.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | 6.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | NaN | NaN | 1.0 | NaN | 2.0 | 0.0 | 2.0 | 0.75 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.0 | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | NaN | NaN | NaN | 3.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 12 | 0 | 1 | 3.0 | 1 | 1 | 1 | -2.0 | 5.0 | 48 | 4 | 3 | 10 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 14 | 3 | 3 | 1 | 4 | 8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 2 | 3 | 4 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 0.0 | 0.0 | 0.0 | 2.0 | NaN | 1.0 | 1.0 | 34.0 | 11.0 | 11.0 | 2.0 | 9.0 | 1.0 | 29.0 | 1.0 | 29.0 | 1.0 | 0.0 | 3.0 | 1.0 | NaN | 1.0 | 2011.0 | 8.0 | 0.0 | 2011.0 | 8.0 | 0.0 | 2012.0 | 1.0 | NaN | 2013.0 | 6.0 | 2014.0 | 6.0 | NaN | NaN | 1.0 | 1 | NaN | NaN | 1.0 | 186.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 4.0 | 0.0 | 1.0 | 0.0 | 1.0 | 2.0 | 2.0 | 2.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | NaN | NaN | NaN | 2011.625000 | 2011.625000 | 42.0 | 0.000000 | 2012.041626 | 0.416626 | 1.0 | 0.0 | 14.0 | 12.0 | 14.0 | 13.0 | 0.0 | NaN | 1.0 | 0.0 | 0.0 | 2.0 | 1.0 | 30.0 | 21.0 | 1 | NaN | 619.0 | 619.0 | NaN | NaN | 0.0 | 20170914.0 | 2.0 | 1.0 | 1.0 | 13.0 | 5.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 20171013.0 | 3.0 | 4.0 | 4.0 | 20180127.0 | 0.0 | 3.0 | 18.0 | 20180831.0 | 4.0 | 1.0 | 1.0 | 13.0 | 6.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20190611.0 | 1.0 | 4.0 | 4.0 | 20190217.0 | 0.0 | 3.0 | 18.0 | 20191101.0 | 2.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 13.0 | 6.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 20200529.0 | 1.0 | 4.0 | 4.0 | 20200124.0 | 5.0 | 2.0 | 4.0 | 4.0 | 1.0 | 1.0 | 4.0 | 4.0 | 2.0 | 5.0 | 20190712.0 | 0.0 | 3.0 | 20201215.0 | 3.0 | 4.0 | 4.0 | 20210201.0 | 5.0 | 1.0 | 4.0 | 4.0 | 1.0 | 1.0 | 2.0 | 4.0 | 1.0 | 5.0 | 20201104.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 20200916.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 13.0 | 6.0 | NaN | 20210602.0 |
df.shape
(3510, 725)
Full List of Variables
The next cell displays the full list of variables in the original data set. We are not going to use everything…
We’ll focus on variables of interest for specific inquiries.
In the following links, we can find the encoding of each variable.
Data source: https://data.stanford.edu/hcmst2017
Variable List per year (Note w1 = 2017, w2 = 2020, w3 = 2022): https://stacks.stanford.edu/file/druid:tq903pj6286/HCMST%202017-%202022%20user%27s%20guide%20v2.3.pdf
Detailed Info on variables: https://stacks.stanford.edu/file/druid:hg921sg6829/HCMST%202017%20to%202022%20v2.2%20codebook.pdf
#df.columns.tolist()
Variables of Interest
The following lists are not actually used in the rest of the notebook for data cleaning purposes.
Still, they display the selected variables of interest (and how their names vary from year to year), so, we are leaving them in the notebook for documentation.
# ppage = Age
# ppeduc = Education (Highest Degree Received)
# ppgender = Gender
# ppethm = Race / Ethnicity
# ppincimp = Household Income
# ppwork = Current Employment Status
= [
subject_demographics_2017 'w1_ppage', 'w1_ppeduc', 'w1_ppgender', 'w1_ppethm', 'w1_ppincimp', 'w1_ppwork'
]= [
subject_demographics_2020 'w2_ppage', 'w2_ppeduc', 'w2_ppgender', 'w2_ppethm', 'w2_ppincimp', 'w2_ppwork'
]= [
subject_demographics_2022 'w3_ppage', 'w3_ppeduc', 'w3_ppgender', 'w3_ppethm', 'w3_ppincimp', 'w3_ppwork'
]
# q4 = Partner Gender
# q9 = Partner Age
# q6b = Partner's Race
# q10 = Partner's Education (Highest Degree Received)
= [
partner_demographics_2017 'w1_q4', 'w1_q9', 'w1_q6b', 'w1_q10'
]= [
partner_demographics_2020 'w2_Q4', 'w2_Q9', 'w2_Q6B', 'w2_Q10'
]= [
partner_demographics_2022 'w3_Q4', 'w3_Q9', 'w3_Q6B', 'w3_Q10'
]
# same_sex_couple = same-sex couple
# married = Married
# q34/rel_qual_combo/rel_qual = Relationship quality 1 (Excellent) to 5 (Very Poor) - Potential Target
= [
relationship_status_2017 'w1_same_sex_couple', 'w1_married', 'w1_q34'
]= [
relationship_status_2020 'w2_same_sex_couple', 'w2_married', 'w2_rel_qual_combo'
]= [
relationship_status_2022 'w3_same_sex_couple', 'w3_married', 'w3_rel_qual'
]
# q21b - Year current relationship started
# q21d - Year of marriage
# q21e - Year relationship ended
# relate/relationship duration = Duration of relationship in years
= [
relationship_time_2017 'w1_q21b_year', 'w1_q21d_year', 'w1_q21e_year', 'w1_relate_duration_in2017_years'
]= [
relationship_time_2020 'w2_q21b_year', 'w2_q21d_year', 'w2_q21e_year', 'w2_relationship_duration'
]= [
relationship_time_2022 'w3_Q21B_year', 'w3_Q21D_year', 'w3_Q21E_year', 'w3_relationship_duration_yrs'
]
# PPT01 = # of children in the household ages 0-1
# PPT25 = # of children in the household ages 2-5
# PPT612 = # of children in the household ages 6-12
# PPT1317 = # of children in the household ages 13-17
# PPT18OV = # of children in the household ages 18-Over
= [
children_info_2017 'w1_PPT01', 'w1_PPT25', 'w1_PPT612', 'w1_PPT1317', 'w1_PPT18OV'
]= [
children_info_2020 'w2_PPT01', 'w2_PPT25', 'w2_PPT612', 'w2_PPT1317', 'w2_PPT18OV'
]= [
children_info_2022 'w3_PPT01', 'w3_PPT25', 'w3_PPT612', 'w3_PPT1317', 'w3_PPT18OV'
]
# Only available in 2022 in full (Post-COVID)
# coronavirus_effect_combo = Is relationship better or worse during pandemic
# pandemic_income = has income gone up or down during pandemic
# subject/partner_had_COVID = has been sick with COVID
# corona/partner_vaccine = has been vaccinated
# COVID_agreement = subject and partner agree on approach to pandemic
= [
covid_vars_2022 'w3_coronavirus_effect_combo', 'w3_pandemic_income', 'w3_subject_had_COVID', 'w3_partner_had_COVID', 'w3_corona_vaccine',
'w3_partner_corona_vaccine', 'w3_COVID_agreement'
]
# Only some variables available per year
# sex_frequency = Frequency of sex
# flirt = how often flirt
# fight = how often fight
# monogamy = subject's commitment to monogamy
# p_monogamy = expected commitment to monogamy from partner
= [
relationship_quality_2017 'w1_sex_frequency'
]= [
relationship_quality_2020 'w2_sex_frequency', 'w2_flirt', 'w2_fight'
]= [
relationship_quality_2022 'w3_sex_frequency', 'w3_flirt', 'w3_fight'
]
Rename Columns
Some variables contain the same information but vary in name year to year. So, we’ll need 3 renaming dictionaries (1 per year)
First, the 3 renaming maps (one per year):
= {
column_renames_2017 #subject_demographics
'w1_ppage': 'subject_age',
'w1_ppeduc': 'subject_education',
'w1_ppgender': 'subject_sex',
'w1_ppethm': 'subject_ethnicity',
'w1_ppincimp': 'subject_income_category',
'w1_ppwork': 'subject_employment_status',
#partner_demographics
'w1_q4': 'partner_sex',
'w1_q9': 'partner_age',
'w1_q6b': 'partner_ethnicity',
'w1_q10': 'partner_education',
#relationship_status
'w1_same_sex_couple': 'same_sex_couple',
'w1_married': 'married',
'w1_q34': 'relationship_quality',
#relationship_time
'w1_q21b_year': 'relationship_start_year',
'w1_q21d_year': 'marriage_year',
'w1_q21e_year': 'break_up_year',
'w1_relate_duration_in2017_years': 'relationship_duration',
#children_info
'w1_PPT01': 'kids_0_1',
'w1_PPT25': 'kids_2_5',
'w1_PPT612': 'kids_6_12',
'w1_PPT1317': 'kids_13_17',
'w1_PPT18OV': 'kids_18_plus',
#covidvars
#'w1_coronavirus_effect_combo': 'rel_change_during_pandemic',
#'w1_pandemic_income': 'inc_change_during_pandemic',
#'w1_subject_had_COVID': 'subject_had_covid',
#'w1_partner_had_COVID': 'partner_had_covid',
#'w1_corona_vaccine': 'subject_vaccinated',
#'w1_partner_corona_vaccine': 'partner_vaccinated',
#'w1_COVID_agreement': 'agree_covid_approach',
#relationship_quality
'w1_sex_frequency': 'sex_frequency',
#'w3_flirt': 'flirts_with_partner',
#'w3_fight': 'fights_with_partner',
}
= {
column_renames_2020 #subject_demographics
'w2_ppage': 'subject_age',
'w2_ppeduc': 'subject_education',
'w2_ppgender': 'subject_sex',
'w2_ppethm': 'subject_ethnicity',
'w2_ppincimp': 'subject_income_category',
'w2_ppwork': 'subject_employment_status',
#partner_demographics
'w2_Q4': 'partner_sex',
'w2_Q9': 'partner_age',
'w2_Q6B': 'partner_ethnicity',
'w2_Q10': 'partner_education',
#relationship_status
'w2_same_sex_couple': 'same_sex_couple',
'w2_married': 'married',
'w2_rel_qual_combo': 'relationship_quality',
#relationship_time
'w2_q21b_year': 'relationship_start_year',
'w2_q21d_year': 'marriage_year',
'w2_q21e_year': 'break_up_year',
'w2_relationship_duration': 'relationship_duration',
#children_info
'w2_PPT01': 'kids_0_1',
'w2_PPT25': 'kids_2_5',
'w2_PPT612': 'kids_6_12',
'w2_PPT1317': 'kids_13_17',
'w2_PPT18OV': 'kids_18_plus',
#covidvars
#'w2_coronavirus_effect_combo': 'rel_change_during_pandemic',
#'w2_pandemic_income': 'inc_change_during_pandemic',
#'w2_subject_had_COVID': 'subject_had_covid',
#'w2_partner_had_COVID': 'partner_had_covid',
#'w2_corona_vaccine': 'subject_vaccinated',
#'w2_partner_corona_vaccine': 'partner_vaccinated',
#'w2_COVID_agreement': 'agree_covid_approach',
#relationship_quality
'w2_sex_frequency': 'sex_frequency',
'w2_flirt': 'flirts_with_partner',
'w2_fight': 'fights_with_partner',
}
= {
column_renames_2022 #subject_demographics
'w3_ppage': 'subject_age',
'w3_ppeduc': 'subject_education',
'w3_ppgender': 'subject_sex',
'w3_ppethm': 'subject_ethnicity',
'w3_ppincimp': 'subject_income_category',
'w3_ppwork': 'subject_employment_status',
#partner_demographics
'w3_Q4': 'partner_sex',
'w3_Q9': 'partner_age',
'w3_Q6B': 'partner_ethnicity',
'w3_Q10': 'partner_education',
#relationship_status
'w3_same_sex_couple': 'same_sex_couple',
'w3_married': 'married',
'w3_rel_qual': 'relationship_quality',
#relationship_time
'w3_Q21B_year': 'relationship_start_year',
'w3_Q21D_year': 'marriage_year',
'w3_Q21E_year': 'break_up_year',
'w3_relationship_duration_yrs': 'relationship_duration',
#children_info
'w3_PPT01': 'kids_0_1',
'w3_PPT25': 'kids_2_5',
'w3_PPT612': 'kids_6_12',
'w3_PPT1317': 'kids_13_17',
'w3_PPT18OV': 'kids_18_plus',
#covidvars
'w3_coronavirus_effect_combo': 'rel_change_during_pandemic',
'w3_pandemic_income': 'inc_change_during_pandemic',
'w3_subject_had_COVID': 'subject_had_covid',
'w3_partner_had_COVID': 'partner_had_covid',
'w3_corona_vaccine': 'subject_vaccinated',
'w3_partner_corona_vaccine': 'partner_vaccinated',
'w3_COVID_agreement': 'agree_covid_approach',
#relationship_quality
'w3_sex_frequency': 'sex_frequency',
'w3_flirt': 'flirts_with_partner',
'w3_fight': 'fights_with_partner',
}
Dataframe per year
We need to create dataframes per year while renaming variables in each dictionary.
We will also create a column named “Wave” that contains the year of the survey.
First, 2017:
= df[['caseid_new'] + list(column_renames_2017.keys())].rename(columns=column_renames_2017)
df_2017 'wave'] = '2017'
df_2017[ df_2017.head()
caseid_new | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | kids_0_1 | kids_2_5 | kids_6_12 | kids_13_17 | kids_18_plus | sex_frequency | wave | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 48 | 9 | 2 | 5 | 13 | 1 | 1.0 | 46.0 | 1.0 | 11.0 | 0.0 | 1.0 | 1.0 | 2013.0 | 2014.0 | NaN | 3.583333 | 0 | 0 | 0 | 0 | 1 | 3.0 | 2017 |
1 | 71609 | 68 | 10 | 2 | 1 | 12 | 1 | 1.0 | 71.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1964.0 | 1969.0 | NaN | 52.750000 | 0 | 0 | 0 | 0 | 2 | 5.0 | 2017 |
2 | 106983 | 39 | 11 | 1 | 1 | 15 | 1 | 2.0 | 49.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1999.0 | 2002.0 | NaN | 17.583334 | 0 | 0 | 2 | 0 | 3 | 3.0 | 2017 |
3 | 121759 | 54 | 9 | 1 | 1 | 16 | 1 | 2.0 | 59.0 | 4.0 | 13.0 | 0.0 | 1.0 | 1.0 | 1990.0 | 1991.0 | NaN | 27.416666 | 0 | 0 | 0 | 0 | 4 | 4.0 | 2017 |
4 | 158083 | 48 | 10 | 1 | 1 | 14 | 1 | 2.0 | 34.0 | 1.0 | 11.0 | 0.0 | 0.0 | NaN | 2011.0 | 2013.0 | 2014.0 | NaN | 0 | 0 | 0 | 0 | 1 | NaN | 2017 |
df_2017.shape
(3510, 25)
Then, 2020:
= df[['caseid_new'] + list(column_renames_2020.keys())].rename(columns=column_renames_2020)
df_2020 'wave'] = '2020'
df_2020[ df_2020.head()
caseid_new | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | kids_0_1 | kids_2_5 | kids_6_12 | kids_13_17 | kids_18_plus | sex_frequency | flirts_with_partner | fights_with_partner | wave | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 51.0 | 9.0 | 2.0 | 1.0 | 10.0 | 1.0 | 1.0 | 51.0 | 1.0 | 13.0 | 0.0 | 0.0 | 2.0 | 2020.0 | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 6.0 | 1.0 | 2020 |
1 | 71609 | 71.0 | 10.0 | 2.0 | 1.0 | 13.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | 56.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 | 6.0 | 4.0 | 2020 |
2 | 106983 | 42.0 | 11.0 | 1.0 | 1.0 | 15.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | 21.0 | 0.0 | 0.0 | 1.0 | 1.0 | 3.0 | 3.0 | 2.0 | 3.0 | 2020 |
3 | 121759 | 57.0 | 9.0 | 1.0 | 1.0 | 16.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 1.0 | 2.0 | NaN | NaN | NaN | 31.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 4.0 | 6.0 | 1.0 | 2020 |
4 | 158083 | 52.0 | 10.0 | 1.0 | 1.0 | 18.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | NaN | NaN | NaN | 2020 |
df_2020.shape
(3510, 27)
Then, 2022:
= df[['caseid_new'] + list(column_renames_2022.keys())].rename(columns=column_renames_2022)
df_2022 'wave'] = '2022'
df_2022[ df_2022.head()
caseid_new | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | kids_0_1 | kids_2_5 | kids_6_12 | kids_13_17 | kids_18_plus | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | sex_frequency | flirts_with_partner | fights_with_partner | wave | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 53.0 | 9.0 | 2.0 | 1.0 | 10.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 1.0 | NaN | NaN | NaN | 1.500000 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 1.0 | 3.0 | 0.0 | 1.0 | 4.0 | 4.0 | 1.0 | 3.0 | 2.0 | 1.0 | 2022 |
1 | 71609 | 72.0 | 10.0 | 2.0 | 1.0 | 14.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 1.0 | 2.0 | NaN | NaN | NaN | 57.416668 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 | 2.0 | 0.0 | 0.0 | 1.0 | 1.0 | 2.0 | 5.0 | 6.0 | 8.0 | 2022 |
2 | 106983 | 43.0 | 11.0 | 1.0 | 1.0 | 14.0 | 1.0 | NaN | NaN | NaN | NaN | 0.0 | 1.0 | 1.0 | NaN | NaN | NaN | 22.333334 | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 2.0 | 2.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 3.0 | 2.0 | 3.0 | 2022 |
3 | 121759 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2022 |
4 | 158083 | 53.0 | 10.0 | 1.0 | 1.0 | 18.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 | NaN | 3.0 | 0.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | 2022 |
df_2022.shape
(3510, 34)
A single Dataframe
Now, we can concatenate the 3 dataframes since they have the same variables. We’ll have the “wave” column to differentiate by year.
= pd.concat([df_2017, df_2020, df_2022], ignore_index=True) df_cleaned
df_cleaned.shape
(10530, 34)
df_cleaned.head()
caseid_new | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | kids_0_1 | kids_2_5 | kids_6_12 | kids_13_17 | kids_18_plus | sex_frequency | wave | flirts_with_partner | fights_with_partner | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 48.0 | 9.0 | 2.0 | 5.0 | 13.0 | 1.0 | 1.0 | 46.0 | 1.0 | 11.0 | 0.0 | 1.0 | 1.0 | 2013.0 | 2014.0 | NaN | 3.583333 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 71609 | 68.0 | 10.0 | 2.0 | 1.0 | 12.0 | 1.0 | 1.0 | 71.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1964.0 | 1969.0 | NaN | 52.750000 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 5.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 106983 | 39.0 | 11.0 | 1.0 | 1.0 | 15.0 | 1.0 | 2.0 | 49.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1999.0 | 2002.0 | NaN | 17.583334 | 0.0 | 0.0 | 2.0 | 0.0 | 3.0 | 3.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 121759 | 54.0 | 9.0 | 1.0 | 1.0 | 16.0 | 1.0 | 2.0 | 59.0 | 4.0 | 13.0 | 0.0 | 1.0 | 1.0 | 1990.0 | 1991.0 | NaN | 27.416666 | 0.0 | 0.0 | 0.0 | 0.0 | 4.0 | 4.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 158083 | 48.0 | 10.0 | 1.0 | 1.0 | 14.0 | 1.0 | 2.0 | 34.0 | 1.0 | 11.0 | 0.0 | 0.0 | NaN | 2011.0 | 2013.0 | 2014.0 | NaN | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
We are just going to rename the ‘caseid_new’ to something more simple:
= df_cleaned.rename(columns={'caseid_new': 'id'}) df_cleaned
Selected variables
df_cleaned.columns.tolist()
['id',
'subject_age',
'subject_education',
'subject_sex',
'subject_ethnicity',
'subject_income_category',
'subject_employment_status',
'partner_sex',
'partner_age',
'partner_ethnicity',
'partner_education',
'same_sex_couple',
'married',
'relationship_quality',
'relationship_start_year',
'marriage_year',
'break_up_year',
'relationship_duration',
'kids_0_1',
'kids_2_5',
'kids_6_12',
'kids_13_17',
'kids_18_plus',
'sex_frequency',
'wave',
'flirts_with_partner',
'fights_with_partner',
'rel_change_during_pandemic',
'inc_change_during_pandemic',
'subject_had_covid',
'partner_had_covid',
'subject_vaccinated',
'partner_vaccinated',
'agree_covid_approach']
Merging kids of all ages to a single feature:
'children'] = df_cleaned['kids_0_1'] + df_cleaned['kids_2_5'] + df_cleaned['kids_6_12'] + df_cleaned['kids_13_17'] + df_cleaned['kids_18_plus']
df_cleaned[
=['kids_0_1', 'kids_2_5', 'kids_6_12', 'kids_13_17', 'kids_18_plus'], inplace=True) df_cleaned.drop(columns
df_cleaned.head()
id | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | sex_frequency | wave | flirts_with_partner | fights_with_partner | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | children | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 48.0 | 9.0 | 2.0 | 5.0 | 13.0 | 1.0 | 1.0 | 46.0 | 1.0 | 11.0 | 0.0 | 1.0 | 1.0 | 2013.0 | 2014.0 | NaN | 3.583333 | 3.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
1 | 71609 | 68.0 | 10.0 | 2.0 | 1.0 | 12.0 | 1.0 | 1.0 | 71.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1964.0 | 1969.0 | NaN | 52.750000 | 5.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
2 | 106983 | 39.0 | 11.0 | 1.0 | 1.0 | 15.0 | 1.0 | 2.0 | 49.0 | 1.0 | 10.0 | 0.0 | 1.0 | 1.0 | 1999.0 | 2002.0 | NaN | 17.583334 | 3.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.0 |
3 | 121759 | 54.0 | 9.0 | 1.0 | 1.0 | 16.0 | 1.0 | 2.0 | 59.0 | 4.0 | 13.0 | 0.0 | 1.0 | 1.0 | 1990.0 | 1991.0 | NaN | 27.416666 | 4.0 | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 |
4 | 158083 | 48.0 | 10.0 | 1.0 | 1.0 | 14.0 | 1.0 | 2.0 | 34.0 | 1.0 | 11.0 | 0.0 | 0.0 | NaN | 2011.0 | 2013.0 | 2014.0 | NaN | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
Data Type Exploration
We can see what data type each column has. We can note that all columns are encoded with numerical values.
'display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option(
print(df_cleaned.dtypes)
id int64
subject_age float64
subject_education float64
subject_sex float64
subject_ethnicity float64
subject_income_category float64
subject_employment_status float64
partner_sex float64
partner_age float64
partner_ethnicity float64
partner_education float64
same_sex_couple float64
married float64
relationship_quality float64
relationship_start_year float64
marriage_year float64
break_up_year float64
relationship_duration float64
sex_frequency float64
wave object
flirts_with_partner float64
fights_with_partner float64
rel_change_during_pandemic float64
inc_change_during_pandemic float64
subject_had_covid float64
partner_had_covid float64
subject_vaccinated float64
partner_vaccinated float64
agree_covid_approach float64
children float64
dtype: object
Decoding Variables
We’ll need to re-map column content to something more interpretable.
Going column by column…
First, education level:
= {
edu_mapping 1: 'no_education',
2: '1st_4th_grade',
3: '5th_6th_grade',
4: '7th_8th_grade',
5: '9th',
6: '10th',
7: '11th',
8: '12th_nodiploma',
9: 'high_school_grad',
10: 'some_college',
11: 'associate_degree',
12: 'bach_degree',
13: 'masters_degree',
14: 'prof_doct_degree'
}
'subject_education'] = df_cleaned['subject_education'].map(edu_mapping)
df_cleaned['partner_education'] = df_cleaned['partner_education'].map(edu_mapping)
df_cleaned[
= [
ordered_levels 'no_education',
'1st_4th_grade',
'5th_6th_grade',
'7th_8th_grade',
'9th',
'10th',
'11th',
'12th_nodiploma',
'high_school_grad',
'some_college',
'associate_degree',
'bach_degree',
'masters_degree',
'prof_doct_degree'
]
# Convert to ordered categorical
'subject_education'] = pd.Categorical(
df_cleaned['subject_education'],
df_cleaned[=ordered_levels,
categories=True
ordered
)
'partner_education'] = pd.Categorical(
df_cleaned['partner_education'],
df_cleaned[=ordered_levels,
categories=True
ordered )
Then gender:
= {
gender_mapping 1: 'male',
2: 'female',
3: 'other'
}
'subject_sex'] = df_cleaned['subject_sex'].map(gender_mapping)
df_cleaned['partner_sex'] = df_cleaned['partner_sex'].map(gender_mapping) df_cleaned[
Then, ethniticy/race:
= {
eth_sub_mapping 1: 'white',
2: 'black',
3: 'other',
4: 'hispanic',
5: '2_plus_eth'
}
'subject_ethnicity'] = df_cleaned['subject_ethnicity'].map(eth_sub_mapping)
df_cleaned[
= {
eth_part_mapping 1: 'white',
2: 'black',
3: 'american_indian',
4: 'asian',
5: 'other'
}
'partner_ethnicity'] = df_cleaned['partner_ethnicity'].map(eth_part_mapping) df_cleaned[
Then, income level:
= {
income_mapping 1: 'under_5k',
2: '5k_7k',
3: '7k_10k',
4: '10k_12k',
5: '12k_15k',
6: '15k_20k',
7: '20k_25k',
8: '25k_30k',
9: '30k_35k',
10: '35k_40k',
11: '40k_50k',
12: '50k_60k',
13: '60k_75k',
14: '75k_85k',
15: '85k_100k',
16: '100k_125k',
17: '125k_150k',
18: '150k_175k',
19: '175k_200k',
20: '200k_250k',
21: 'over_250k'
}
'subject_income_category'] = df_cleaned['subject_income_category'].map(income_mapping)
df_cleaned[
= [
ordered_income_levels 'under_5k', '5k_7k', '7k_10k', '10k_12k', '12k_15k', '15k_20k',
'20k_25k', '25k_30k', '30k_35k', '35k_40k', '40k_50k', '50k_60k',
'60k_75k', '75k_85k', '85k_100k', '100k_125k', '125k_150k',
'150k_175k', '175k_200k', '200k_250k', 'over_250k'
]
'subject_income_category'] = pd.Categorical(
df_cleaned['subject_income_category'],
df_cleaned[=ordered_income_levels,
categories=True
ordered )
Then, employment status:
= {
employment_mapping 1: 'working_paid_employee',
2: 'working_self_employed',
3: 'not_working_temp_layoff',
4: 'not_working_looking',
5: 'not_working_retired',
6: 'not_working_disabled',
7: 'not_working_other'
}
'subject_employment_status'] = df_cleaned['subject_employment_status'].map(employment_mapping) df_cleaned[
Then, same-sex couple encoding:
= {
same_sex_couple_map 0: 'no',
1: 'yes'
}
'same_sex_couple'] = df_cleaned['same_sex_couple'].map(same_sex_couple_map) df_cleaned[
Then, marital status:
= {
married_mapping 0: 'not_married',
1: 'married'
}
'married'] = df_cleaned['married'].map(married_mapping) df_cleaned[
Then, relationship quality (THIS COULD BE A TARGET VARIABLE FOR MANY STUDIES):
= {
rel_qual_mapping 1: 'excellent',
2: 'good',
3: 'fair',
4: 'poor',
5: 'very_poor'
}
'relationship_quality'] = df_cleaned['relationship_quality'].map(rel_qual_mapping) df_cleaned[
Then, the effect of COVID on the relationship. Note that all COVID variables only appear in the 2022 survey.
= {
covid_effect_mapping 1: 'better_than_before',
2: 'no_change',
3: 'worse_than_before'
}
'rel_change_during_pandemic'] = df_cleaned['rel_change_during_pandemic'].map(covid_effect_mapping) df_cleaned[
Then, the effect of COVID on income level.
= {
covid_income_mapping 1: 'much_worse',
2: 'worse',
3: 'no_change',
4: 'better',
5: 'much_better'
}
'inc_change_during_pandemic'] = df_cleaned['inc_change_during_pandemic'].map(covid_income_mapping) df_cleaned[
Then, if subject or partner had COVID
= {
had_covid_mapping 0: 'no',
1: 'yes'
}
'subject_had_covid'] = df_cleaned['subject_had_covid'].map(had_covid_mapping)
df_cleaned['partner_had_covid'] = df_cleaned['partner_had_covid'].map(had_covid_mapping) df_cleaned[
Then, status of COVID vaccination:
= {
corona_vaccine_mapping 1: 'fully_vaccinated_and_booster',
2: 'fully_vaccinated_no_booster',
3: 'partially_vaccinated',
4: 'not_vaccinated'
}
'subject_vaccinated'] = df_cleaned['subject_vaccinated'].map(corona_vaccine_mapping)
df_cleaned['partner_vaccinated'] = df_cleaned['partner_vaccinated'].map(corona_vaccine_mapping) df_cleaned[
Then, if the couple agrees on COVID approach:
= {
covid_agreement_mapping 1: 'completely_agree',
2: 'mostly_agree',
3: 'mostly_disagree',
4: 'completely_disagree'
}
'agree_covid_approach'] = df_cleaned['agree_covid_approach'].map(covid_agreement_mapping) df_cleaned[
Then, sex frequency:
= {
sex_frequency_mapping 1: 'once_or_more_a_day',
2: '3_to_6_times_a_week',
3: 'once_or_twice_a_week',
4: '2_to_3_times_a_month',
5: 'once_a_month_or_less'
}
'sex_frequency'] = df_cleaned['sex_frequency'].map(sex_frequency_mapping) df_cleaned[
Then, how often the subject flirts with partner:
= {
flirt_mapping 1: 'every_day',
2: 'a_few_times_a_week',
3: 'once_a_week',
4: '1_to_3_times_a_month',
5: 'less_than_once_a_month',
6: 'never'
}
'flirts_with_partner'] = df_cleaned['flirts_with_partner'].map(flirt_mapping)
df_cleaned[
= [
ordered_flirt_frequency_levels 'every_day',
'a_few_times_a_week',
'once_a_week',
'1_to_3_times_a_month',
'less_than_once_a_month',
'never'
]
'flirts_with_partner'] = pd.Categorical(
df_cleaned['flirts_with_partner'],
df_cleaned[=ordered_flirt_frequency_levels,
categories=True
ordered )
Then, how often the subject fights with partner:
= {
fight_mapping 1: '0_times',
2: '1_time',
3: '2_times',
4: '3_times',
5: '4_times',
6: '5_times',
7: '6_times',
8: '7_or_more_times'
}
'fights_with_partner'] = df_cleaned['fights_with_partner'].map(fight_mapping)
df_cleaned[
= [
ordered_fight_frequency_levels '0_times',
'1_time',
'2_times',
'3_times',
'4_times',
'5_times',
'6_times',
'7_or_more_times'
]
'fights_with_partner'] = pd.Categorical(
df_cleaned['fights_with_partner'],
df_cleaned[=ordered_fight_frequency_levels,
categories=True
ordered )
Now, we can see the updated data types for each variable.
We can also see a sample of how the data looks after the recoding process.
'display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option(
print(df_cleaned.dtypes)
id int64
subject_age float64
subject_education category
subject_sex object
subject_ethnicity object
subject_income_category category
subject_employment_status object
partner_sex object
partner_age float64
partner_ethnicity object
partner_education category
same_sex_couple object
married object
relationship_quality object
relationship_start_year float64
marriage_year float64
break_up_year float64
relationship_duration float64
sex_frequency object
wave object
flirts_with_partner category
fights_with_partner category
rel_change_during_pandemic object
inc_change_during_pandemic object
subject_had_covid object
partner_had_covid object
subject_vaccinated object
partner_vaccinated object
agree_covid_approach object
children float64
dtype: object
20) df_cleaned.sample(
id | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | relationship_quality | relationship_start_year | marriage_year | break_up_year | relationship_duration | sex_frequency | wave | flirts_with_partner | fights_with_partner | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | children | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5795 | 2586225 | 57.0 | masters_degree | female | white | 100k_125k | working_paid_employee | male | 58.0 | white | prof_doct_degree | no | not_married | excellent | 2019.0 | NaN | NaN | 1.000000 | NaN | 2020 | a_few_times_a_week | 0_times | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
5371 | 2330205 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4116 | 1570547 | 61.0 | some_college | male | white | 100k_125k | working_paid_employee | NaN | NaN | NaN | NaN | no | married | excellent | NaN | NaN | NaN | 38.000000 | once_or_twice_a_week | 2020 | less_than_once_a_month | 0_times | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
1270 | 2011087 | 29.0 | high_school_grad | male | white | 7k_10k | working_paid_employee | female | 23.0 | white | high_school_grad | no | not_married | NaN | 2016.0 | NaN | 2017.0 | NaN | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
7626 | 1570547 | 62.0 | some_college | male | white | 100k_125k | working_paid_employee | NaN | NaN | NaN | NaN | no | married | excellent | NaN | NaN | NaN | 40.166668 | 2_to_3_times_a_month | 2022 | less_than_once_a_month | 0_times | no_change | no_change | no | no | fully_vaccinated_and_booster | fully_vaccinated_and_booster | completely_agree | 2.0 |
3719 | 924575 | 56.0 | prof_doct_degree | female | other | 125k_150k | working_paid_employee | NaN | NaN | NaN | NaN | no | not_married | good | NaN | NaN | NaN | 19.000000 | NaN | 2020 | 1_to_3_times_a_month | 0_times | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
2076 | 2449593 | 71.0 | associate_degree | male | white | over_250k | working_self_employed | female | 68.0 | white | prof_doct_degree | no | married | fair | 1972.0 | 1973.0 | NaN | 44.750000 | once_a_month_or_less | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
6036 | 2699919 | 42.0 | high_school_grad | female | hispanic | 100k_125k | not_working_looking | NaN | NaN | NaN | NaN | no | married | excellent | NaN | NaN | NaN | 25.000000 | 3_to_6_times_a_week | 2020 | a_few_times_a_week | 0_times | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.0 |
1634 | 2219759 | 23.0 | some_college | female | white | 125k_150k | working_paid_employee | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
9503 | 2682019 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2022 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1958 | 2401785 | 41.0 | bach_degree | male | hispanic | 25k_30k | working_paid_employee | male | 48.0 | white | some_college | yes | married | good | 2012.0 | 2016.0 | NaN | 4.750000 | 2_to_3_times_a_month | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
1612 | 2212021 | 50.0 | high_school_grad | female | white | 75k_85k | working_paid_employee | male | 52.0 | white | high_school_grad | no | married | excellent | 1983.0 | 1985.0 | NaN | 34.500000 | once_or_twice_a_week | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.0 |
6392 | 2844081 | 30.0 | associate_degree | male | white | 35k_40k | working_paid_employee | NaN | NaN | NaN | NaN | NaN | not_married | NaN | NaN | NaN | NaN | NaN | NaN | 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
1092 | 1928049 | 54.0 | associate_degree | female | hispanic | 75k_85k | working_paid_employee | male | 48.0 | other | some_college | no | not_married | excellent | 2002.0 | NaN | NaN | 15.500000 | once_a_month_or_less | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
6791 | 2932429 | 35.0 | bach_degree | male | white | 125k_150k | working_paid_employee | NaN | NaN | NaN | NaN | no | married | good | NaN | NaN | NaN | 14.000000 | once_a_month_or_less | 2020 | never | 1_time | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 |
4025 | 1420887 | 78.0 | bach_degree | female | white | 175k_200k | not_working_other | NaN | NaN | NaN | NaN | no | married | excellent | NaN | NaN | NaN | 55.000000 | once_a_month_or_less | 2020 | never | 0_times | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
1023 | 1905539 | 57.0 | high_school_grad | male | white | 100k_125k | working_self_employed | female | 54.0 | white | high_school_grad | no | not_married | NaN | 1986.0 | 1988.0 | 2006.0 | NaN | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 |
5290 | 2296263 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2020 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9679 | 2765923 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2022 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
464 | 1359305 | 63.0 | high_school_grad | male | white | 150k_175k | working_self_employed | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2017 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 |
NaN Percentage
Below we can note the % of NaN values in each column.
We can note that most columns still has a major % of NaN values.
These high percentages are mostly dependent on the year of the survey.
For example:
- 2017 is missing primarily COVID related info.
- 2020 is missing primarily partner info, chronological info and COVID related info.
- 2022 is missing primarily partner info and chronological info.
This makes sense as the documentation notes that:
- Variables in survey changed from year to year slightly.
- “Response rate was 3510/6753=52% in 2017, 2107/2431= 87% in 2020, and 1722/2073=83% in 2022. The Denominators in 2020 and 2022 include only subjects who remained in the KnowledgePanel, as they were the only subjects eligible to be contacted.”
df_cleaned.shape
(10530, 30)
= df_cleaned.isna().mean() * 100
nan_percentage_per_column print(nan_percentage_per_column)
id 0.000000
subject_age 30.303894
subject_education 30.303894
subject_sex 30.303894
subject_ethnicity 30.303894
subject_income_category 30.303894
subject_employment_status 30.303894
partner_sex 65.536562
partner_age 65.707502
partner_ethnicity 65.660019
partner_education 65.622032
same_sex_couple 39.563153
married 31.405508
relationship_quality 44.586895
relationship_start_year 66.666667
marriage_year 79.116809
break_up_year 96.077873
relationship_duration 46.087369
sex_frequency 47.853751
wave 0.000000
flirts_with_partner 70.940171
fights_with_partner 71.975309
rel_change_during_pandemic 87.388414
inc_change_during_pandemic 83.741690
subject_had_covid 83.732194
partner_had_covid 86.904084
subject_vaccinated 83.817664
partner_vaccinated 87.245964
agree_covid_approach 87.435897
children 30.303894
dtype: float64
= (
nan_percentage_per_column_by_wave
df_cleaned'wave')
.groupby(apply(lambda g: g.isna().mean() * 100)
.
.transpose()
)
print(nan_percentage_per_column_by_wave)
wave 2017 2020 2022
id 0.000000 0.000000 0.000000
subject_age 0.000000 39.971510 50.940171
subject_education 0.000000 39.971510 50.940171
subject_sex 0.000000 39.971510 50.940171
subject_ethnicity 0.000000 39.971510 50.940171
subject_income_category 0.000000 39.971510 50.940171
subject_employment_status 0.000000 39.971510 50.940171
partner_sex 3.390313 95.868946 97.350427
partner_age 3.874644 95.868946 97.378917
partner_ethnicity 3.732194 95.897436 97.350427
partner_education 3.618234 95.897436 97.350427
same_sex_couple 3.304843 53.475783 61.908832
married 3.304843 39.971510 50.940171
relationship_quality 18.888889 52.962963 61.908832
relationship_start_year 6.666667 95.925926 97.407407
marriage_year 38.205128 99.658120 99.487179
break_up_year 88.319088 99.943020 99.971510
relationship_duration 21.225071 54.045584 62.991453
sex_frequency 23.475783 55.897436 64.188034
wave 0.000000 0.000000 0.000000
flirts_with_partner 100.000000 51.225071 61.595442
fights_with_partner 100.000000 53.589744 62.336182
rel_change_during_pandemic 100.000000 100.000000 62.165242
inc_change_during_pandemic 100.000000 100.000000 51.225071
subject_had_covid 100.000000 100.000000 51.196581
partner_had_covid 100.000000 100.000000 60.712251
subject_vaccinated 100.000000 100.000000 51.452991
partner_vaccinated 100.000000 100.000000 61.737892
agree_covid_approach 100.000000 100.000000 62.307692
children 0.000000 39.971510 50.940171
C:\Users\Paco\AppData\Local\Temp\ipykernel_21604\437808906.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda g: g.isna().mean() * 100)
We can remove some of the rows that are fully empty (even if they have values in “id” and “wave”)
= df_cleaned.columns.difference(['id', 'wave'])
cols_to_check = df_cleaned.dropna(subset=cols_to_check, how='all') df_cleaned
df_cleaned.shape
(7339, 30)
= (
nan_percentage_per_column_by_wave
df_cleaned'wave')
.groupby(apply(lambda g: g.isna().mean() * 100)
.
.transpose()
)
print(nan_percentage_per_column_by_wave)
wave 2017 2020 2022
id 0.000000 0.000000 0.000000
subject_age 0.000000 0.000000 0.000000
subject_education 0.000000 0.000000 0.000000
subject_sex 0.000000 0.000000 0.000000
subject_ethnicity 0.000000 0.000000 0.000000
subject_income_category 0.000000 0.000000 0.000000
subject_employment_status 0.000000 0.000000 0.000000
partner_sex 3.390313 93.118178 94.599303
partner_age 3.874644 93.118178 94.657375
partner_ethnicity 3.732194 93.165638 94.599303
partner_education 3.618234 93.165638 94.599303
same_sex_couple 3.304843 22.496440 22.357724
married 3.304843 0.000000 0.000000
relationship_quality 18.888889 21.642145 22.357724
relationship_start_year 6.666667 93.213099 94.715447
marriage_year 38.205128 99.430470 98.954704
break_up_year 88.319088 99.905078 99.941928
relationship_duration 21.225071 23.445657 24.564460
sex_frequency 23.475783 26.530612 27.003484
wave 0.000000 0.000000 0.000000
flirts_with_partner 100.000000 18.747034 21.718931
fights_with_partner 100.000000 22.686284 23.228804
rel_change_during_pandemic 100.000000 100.000000 22.880372
inc_change_during_pandemic 100.000000 100.000000 0.580720
subject_had_covid 100.000000 100.000000 0.522648
partner_had_covid 100.000000 100.000000 19.918699
subject_vaccinated 100.000000 100.000000 1.045296
partner_vaccinated 100.000000 100.000000 22.009292
agree_covid_approach 100.000000 100.000000 23.170732
children 0.000000 0.000000 0.000000
C:\Users\Paco\AppData\Local\Temp\ipykernel_21604\437808906.py:4: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
.apply(lambda g: g.isna().mean() * 100)
Final Reorganization of Columns
Giving amount of missing data, “Partner variables” and “Chronological variables” that have significant %NaN rates in 2020 and 2022 could be removed for specific inquiries. However, I would propose to keep them in the overall dataset.
Therefore, the final data set would be:
df_cleaned.columns
Index(['id', 'subject_age', 'subject_education', 'subject_sex',
'subject_ethnicity', 'subject_income_category',
'subject_employment_status', 'partner_sex', 'partner_age',
'partner_ethnicity', 'partner_education', 'same_sex_couple', 'married',
'relationship_quality', 'relationship_start_year', 'marriage_year',
'break_up_year', 'relationship_duration', 'sex_frequency', 'wave',
'flirts_with_partner', 'fights_with_partner',
'rel_change_during_pandemic', 'inc_change_during_pandemic',
'subject_had_covid', 'partner_had_covid', 'subject_vaccinated',
'partner_vaccinated', 'agree_covid_approach', 'children'],
dtype='object')
= df_cleaned[['id', 'wave', #Identifiers
data
'subject_age', 'subject_education', 'subject_sex', #Subject variables
'subject_ethnicity', 'subject_income_category', 'subject_employment_status',
'partner_sex', 'partner_age', 'partner_ethnicity', 'partner_education', #Partner variables
'same_sex_couple', 'married', 'sex_frequency', 'flirts_with_partner', 'fights_with_partner', #Couple Habits
'relationship_start_year', 'marriage_year', 'break_up_year', #Chronology
'relationship_duration',
'children', #Kids Info
'rel_change_during_pandemic', 'inc_change_during_pandemic', #Pandemic Vars
'subject_had_covid', 'partner_had_covid', 'subject_vaccinated',
'partner_vaccinated', 'agree_covid_approach',
'relationship_quality' #Outcome
]]
data.head()
id | wave | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | sex_frequency | flirts_with_partner | fights_with_partner | relationship_start_year | marriage_year | break_up_year | relationship_duration | children | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | relationship_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 2017 | 48.0 | high_school_grad | female | 2_plus_eth | 60k_75k | working_paid_employee | male | 46.0 | white | associate_degree | no | married | once_or_twice_a_week | NaN | NaN | 2013.0 | 2014.0 | NaN | 3.583333 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
1 | 71609 | 2017 | 68.0 | some_college | female | white | 50k_60k | working_paid_employee | male | 71.0 | white | some_college | no | married | once_a_month_or_less | NaN | NaN | 1964.0 | 1969.0 | NaN | 52.750000 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
2 | 106983 | 2017 | 39.0 | associate_degree | male | white | 85k_100k | working_paid_employee | female | 49.0 | white | some_college | no | married | once_or_twice_a_week | NaN | NaN | 1999.0 | 2002.0 | NaN | 17.583334 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
3 | 121759 | 2017 | 54.0 | high_school_grad | male | white | 100k_125k | working_paid_employee | female | 59.0 | asian | masters_degree | no | married | 2_to_3_times_a_month | NaN | NaN | 1990.0 | 1991.0 | NaN | 27.416666 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
4 | 158083 | 2017 | 48.0 | some_college | male | white | 75k_85k | working_paid_employee | female | 34.0 | white | associate_degree | no | not_married | NaN | NaN | NaN | 2011.0 | 2013.0 | 2014.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Final Data Review
for column in df_cleaned.columns:
print(f"Column: {column}")
print(f"Data type: {df_cleaned[column].dtype}")
print(f"Unique values: {df_cleaned[column].unique()}\n")
Column: id
Data type: int64
Unique values: [ 53001 71609 106983 ... 2968971 2969933 2972135]
Column: subject_age
Data type: float64
Unique values: [48. 68. 39. 54. 59. 72. 55. 73. 46. 43. 57. 50. 61. 79. 58. 64. 81. 70.
80. 53. 51. 74. 56. 40. 36. 22. 47. 78. 67. 25. 65. 38. 24. 66. 35. 26.
60. 71. 27. 29. 34. 76. 21. 41. 28. 19. 49. 86. 20. 23. 44. 84. 62. 63.
45. 52. 77. 75. 42. 82. 69. 92. 85. 32. 37. 33. 30. 31. 90. 83. 18. 87.
93. 89. 91. 88. 95. 97. 98.]
Column: subject_education
Data type: category
Unique values: ['high_school_grad', 'some_college', 'associate_degree', 'bach_degree', 'masters_degree', ..., '11th', '7th_8th_grade', '1st_4th_grade', 'no_education', '5th_6th_grade']
Length: 14
Categories (14, object): ['no_education' < '1st_4th_grade' < '5th_6th_grade' < '7th_8th_grade' ... 'associate_degree' < 'bach_degree' < 'masters_degree' < 'prof_doct_degree']
Column: subject_sex
Data type: object
Unique values: ['female' 'male']
Column: subject_ethnicity
Data type: object
Unique values: ['2_plus_eth' 'white' 'black' 'other' 'hispanic']
Column: subject_income_category
Data type: category
Unique values: ['60k_75k', '50k_60k', '85k_100k', '100k_125k', '75k_85k', ..., '15k_20k', '20k_25k', 'under_5k', '7k_10k', '5k_7k']
Length: 21
Categories (21, object): ['under_5k' < '5k_7k' < '7k_10k' < '10k_12k' ... '150k_175k' < '175k_200k' < '200k_250k' < 'over_250k']
Column: subject_employment_status
Data type: object
Unique values: ['working_paid_employee' 'not_working_disabled' 'not_working_retired'
'not_working_looking' 'working_self_employed' 'not_working_other'
'not_working_temp_layoff']
Column: partner_sex
Data type: object
Unique values: ['male' 'female' nan 'other']
Column: partner_age
Data type: float64
Unique values: [46. 71. 49. 59. 34. 52. 75. 55. 79. 37. 63. 51. 61. 62. 74. 50. 65. 58.
83. 47. 70. 77. 42. 69. 24. 53. 56. 54. 57. 32. 44. 72. 28. 64. 30. nan
33. 60. 43. 25. 27. 31. 80. 21. 78. 67. 29. 40. 39. 76. 23. 41. 45. 82.
26. 66. 68. 86. 36. 84. 18. 20. 38. 73. 19. 81. 22. 91. 85. 48. -1. 87.
89. 35. 10. 90. 94. 95. 17. 14. 16. 11.]
Column: partner_ethnicity
Data type: object
Unique values: ['white' 'asian' 'black' 'american_indian' 'other' nan]
Column: partner_education
Data type: category
Unique values: ['associate_degree', 'some_college', 'masters_degree', 'bach_degree', 'high_school_grad', ..., '7th_8th_grade', '10th', 'no_education', '5th_6th_grade', '1st_4th_grade']
Length: 15
Categories (14, object): ['no_education' < '1st_4th_grade' < '5th_6th_grade' < '7th_8th_grade' ... 'associate_degree' < 'bach_degree' < 'masters_degree' < 'prof_doct_degree']
Column: same_sex_couple
Data type: object
Unique values: ['no' nan 'yes']
Column: married
Data type: object
Unique values: ['married' 'not_married' nan]
Column: relationship_quality
Data type: object
Unique values: ['excellent' nan 'good' 'fair' 'very_poor' 'poor']
Column: relationship_start_year
Data type: float64
Unique values: [2013. 1964. 1999. 1990. 2011. 1993. 1981. 1983. 1996. 2001. 2014. 1984.
1987. 1988. 1962. 1989. 1954. 1970. 2007. nan 1960. 2015. 1978. 2006.
2016. 1991. 1986. 1997. 2010. 2017. 1968. 1998. 1980. 2004. 1982. 1972.
2002. 1977. 2008. 1974. 1985. 1994. 1979. 2009. 1959. 1965. 2012. 1971.
1976. 1961. 1958. 1995. 1969. 1953. 1973. 2003. 1992. 2000. 1967. 1966.
1963. 2005. 1975. 1955. 1952. 1947. 1957. 1951. 1950. 1956. 1948. 1949.
1942. 2020. 2019. 2018. 1920. 2021. 2022.]
Column: marriage_year
Data type: float64
Unique values: [2014. 1969. 2002. 1991. 2013. 1997. nan 1983. 2017. 2001. 1990. 2007.
1964. 1985. 1984. 1956. 1995. 1972. 2008. 1960. 1986. 2000. 2009. 1987.
1965. 1978. 1970. 2011. 1973. 2006. 1975. 1981. 1977. 1963. 1988. 1980.
1976. 1999. 1992. 1996. 1961. 1974. 1994. 1955. 2016. 2003. 1982. 1998.
1971. 1989. 1979. 1966. 2005. 1962. 1967. 1968. 2010. 2015. 2004. 1993.
1959. 2012. 1951. 1952. 1957. 1958. 1950. 1944. 1948. 1953. 2019. 2018.
2020. 2022. 2021.]
Column: break_up_year
Data type: float64
Unique values: [ nan 2014. 1989. 1985. 1984. 2016. 2017. 2012. 2013. 2015. 2007. 2006.
2003. 2009. 1996. 2000. 1999. 2010. 1987. 1990. 1976. 2001. 1994. 1997.
1978. 2004. 1992. 1975. 2011. 2002. 2008. 1986. 1993. 1980. 1995. 1988.
1998. 1977. 2005. 1983. 1991. 2018. 2020. 2022.]
Column: relationship_duration
Data type: float64
Unique values: [ 3.58333325e+00 5.27500000e+01 1.75833340e+01 2.74166660e+01
nan 2.35833340e+01 3.42500000e+01 2.36666660e+01
2.10000000e+01 1.64166660e+01 3.41666675e+00 3.34166679e+01
2.98333340e+01 2.89166660e+01 2.67500000e+01 2.80000000e+01
3.59166679e+01 6.29166679e+01 2.45000000e+01 2.44166660e+01
4.70833321e+01 1.03333330e+01 1.77500000e+01 1.66666663e+00
3.89166679e+01 1.80833340e+01 1.09166670e+01 3.58333321e+01
2.40833340e+01 2.88333340e+01 2.80833340e+01 2.65000000e+01
3.43333321e+01 3.11666660e+01 2.75000000e+00 5.83333349e+00
5.66666651e+00 1.95833340e+01 5.26666679e+01 3.95000000e+01
4.90833321e+01 6.41666651e+00 1.91666660e+01 3.65833321e+01
1.33333330e+01 3.47500000e+01 4.53333321e+01 1.46666670e+01
3.99166679e+01 9.16666698e+00 4.25833321e+01 3.18333340e+01
2.30000000e+01 3.81666679e+01 4.02500000e+01 1.15000000e+01
3.90833321e+01 4.93333321e+01 8.16666698e+00 3.02500000e+01
5.78333321e+01 5.15833321e+01 5.16666651e+00 3.10833340e+01
3.67500000e+01 3.88333321e+01 4.56666679e+01 3.30000000e+01
3.32500000e+01 4.61666679e+01 2.78333340e+01 1.33333337e+00
1.97500000e+01 3.80833321e+01 3.16666660e+01 3.97500000e+01
2.02500000e+01 5.95000000e+01 2.24166660e+01 3.40833321e+01
4.10000000e+01 6.66666687e-01 4.13333321e+01 4.12500000e+01
1.00000000e+01 2.08333325e+00 7.83333349e+00 3.22500000e+01
2.03333340e+01 9.41666698e+00 5.65833321e+01 3.75000000e+00
1.00833330e+01 2.58333325e+00 4.31666679e+01 4.87500000e+01
3.66666675e+00 2.77500000e+01 4.79166679e+01 2.69166660e+01
6.37500000e+01 2.16666675e+00 4.58333349e+00 3.51666679e+01
4.40000000e+01 1.26666670e+01 1.50000000e+01 2.20833340e+01
1.41666670e+01 3.52500000e+01 5.48333321e+01 2.30833340e+01
4.50833321e+01 3.72500000e+01 3.24166679e+01 5.28333321e+01
6.50000000e+00 2.96666660e+01 2.71666660e+01 5.90833321e+01
4.05000000e+01 1.93333340e+01 2.49166660e+01 1.65833340e+01
2.91666675e+00 5.19166679e+01 1.66666672e-01 1.40000000e+01
2.63333340e+01 4.98333321e+01 5.07500000e+01 2.90833340e+01
2.50000000e-01 2.56666660e+01 2.68333340e+01 4.26666679e+01
4.41666651e+00 1.04166670e+01 5.37500000e+01 0.00000000e+00
3.55833321e+01 4.91666651e+00 3.04166660e+01 1.78333340e+01
3.00833340e+01 5.46666679e+01 9.58333302e+00 2.12500000e+01
4.94166679e+01 3.57500000e+01 1.91666663e+00 2.94166660e+01
1.89166660e+01 1.54166670e+01 1.05833330e+01 1.94166660e+01
3.30833321e+01 1.25000000e+01 1.61666660e+01 4.24166679e+01
1.68333340e+01 4.60833321e+01 8.50000000e+00 3.91666675e+00
4.71666679e+01 5.00000000e-01 4.28333321e+01 4.15000000e+01
2.29166660e+01 3.84166679e+01 2.19166660e+01 1.57500000e+01
6.83333349e+00 3.96666679e+01 1.55000000e+01 6.16666679e+01
2.70833340e+01 4.16666679e+01 5.21666679e+01 2.00833340e+01
3.50000000e+00 2.82500000e+01 1.36666670e+01 6.50833359e+01
2.87500000e+01 2.52500000e+01 5.13333321e+01 5.25000000e+01
2.51666660e+01 3.68333321e+01 2.91666660e+01 3.54166679e+01
4.75833321e+01 2.06666660e+01 5.72500000e+01 3.41666679e+01
1.10000000e+01 4.92500000e+01 5.10833321e+01 3.20000000e+01
2.79166660e+01 2.33333340e+01 2.64166660e+01 2.76666660e+01
4.46666679e+01 5.66666679e+01 3.20833321e+01 5.55833321e+01
5.79166679e+01 2.48333340e+01 5.30000000e+01 5.55000000e+01
1.58333337e+00 1.08333337e+00 3.39166679e+01 1.24166670e+01
7.91666651e+00 9.00000000e+00 4.00000000e+01 5.08333321e+01
2.00000000e+01 3.35000000e+01 5.36666679e+01 4.33333349e+00
3.37500000e+01 1.58333330e+01 4.39166679e+01 4.83333321e+01
4.60000000e+01 8.33333358e-02 5.00000000e+01 4.10833321e+01
2.04166660e+01 2.99166660e+01 2.50000000e+00 4.50000000e+01
5.00000000e+00 1.20000000e+01 1.90833340e+01 1.88333340e+01
7.50000000e+00 2.38333340e+01 1.62500000e+01 8.91666698e+00
8.41666698e+00 7.00000000e+01 2.33333325e+00 2.14166660e+01
4.85000000e+01 1.80000000e+01 5.25833321e+01 4.89166679e+01
2.55000000e+01 2.09166660e+01 6.00000000e+00 2.34166660e+01
2.83333340e+01 4.01666679e+01 5.99166679e+01 5.60000000e+01
3.77500000e+01 2.57500000e+01 3.25000000e+00 9.25000000e+00
3.63333321e+01 1.59166670e+01 1.32500000e+01 1.25833330e+01
1.85833340e+01 7.50000000e-01 4.38333321e+01 3.86666679e+01
3.66666679e+01 5.83333313e-01 2.92500000e+01 2.16666660e+01
6.60000000e+01 5.50833321e+01 1.16666663e+00 5.20000000e+01
5.44166679e+01 3.65000000e+01 3.23333321e+01 8.83333302e+00
4.04166679e+01 4.69166679e+01 2.17500000e+01 3.21666679e+01
5.03333321e+01 1.39166670e+01 4.57500000e+01 1.75000000e+01
7.75000000e+00 2.73333340e+01 5.58333321e+01 2.46666660e+01
2.62500000e+01 1.30000000e+01 9.91666698e+00 2.28333340e+01
1.83333337e+00 2.58333340e+01 2.15833340e+01 2.95833340e+01
2.26666660e+01 3.76666679e+01 4.45833321e+01 9.75000000e+00
3.25833321e+01 5.11666679e+01 3.17500000e+01 2.01666660e+01
2.75833340e+01 5.47500000e+01 1.87500000e+01 4.80833321e+01
4.97500000e+01 4.90000000e+01 3.45000000e+01 5.91666651e+00
3.13333340e+01 3.64166679e+01 3.70000000e+01 5.09166679e+01
4.81666679e+01 6.50000000e+01 4.03333321e+01 3.93333321e+01
4.76666679e+01 4.08333349e+00 4.48333321e+01 1.35000000e+01
4.07500000e+01 4.16666651e+00 3.75833321e+01 4.23333321e+01
5.50000000e+00 4.64166679e+01 1.11666670e+01 1.41666663e+00
2.50833340e+01 1.44166670e+01 2.11666660e+01 5.49166679e+01
4.20833321e+01 2.60000000e+01 4.30833321e+01 1.83333340e+01
3.28333321e+01 1.69166660e+01 1.82500000e+01 3.90000000e+01
3.50000000e+01 1.63333340e+01 1.74166660e+01 1.23333330e+01
1.37500000e+01 5.41666651e+00 2.23333340e+01 5.70833321e+01
8.33333302e+00 3.55000000e+01 4.59166679e+01 2.95000000e+01
3.53333321e+01 3.74166679e+01 5.39166679e+01 6.30000000e+01
6.91666651e+00 1.19166670e+01 8.66666698e+00 4.34166679e+01
2.41666675e+00 6.58333349e+00 4.83333349e+00 5.69166679e+01
8.00000000e+00 1.47500000e+01 2.22500000e+01 3.61666679e+01
4.00833321e+01 1.52500000e+01 3.19166660e+01 3.06666660e+01
5.24166679e+01 5.77500000e+01 3.08333340e+01 5.42500000e+01
4.30000000e+01 5.60833321e+01 1.72500000e+01 3.08333325e+00
2.47500000e+01 4.88333321e+01 4.18333321e+01 2.90000000e+01
3.91666679e+01 3.25000000e+01 1.13333330e+01 3.38333321e+01
2.42500000e+01 4.17500000e+01 2.05000000e+01 4.49166679e+01
4.73333321e+01 3.83333325e+00 3.83333321e+01 1.45000000e+01
6.75000000e+00 4.55833321e+01 4.95833321e+01 2.75000000e+01
2.70000000e+01 1.05000000e+01 8.33333313e-01 2.05833340e+01
3.00000000e+00 9.66666698e+00 1.76666660e+01 4.84166679e+01
2.53333340e+01 1.67500000e+01 6.33333349e+00 3.50833321e+01
2.81666660e+01 1.18333330e+01 2.25000000e+01 5.02500000e+01
4.35833321e+01 5.06666679e+01 3.33333343e-01 1.60833340e+01
4.70000000e+01 6.25000000e+00 1.21666670e+01 4.82500000e+01
1.16666670e+01 9.83333302e+00 3.85000000e+01 5.63333321e+01
1.50000000e+00 4.75000000e+01 3.95833321e+01 6.58333359e+01
-3.33333343e-01 2.41666660e+01 1.50833330e+01 8.58333302e+00
6.67500000e+01 4.50000000e+00 5.58333349e+00 1.84166660e+01
2.08333340e+01 4.06666679e+01 5.53333321e+01 7.41666651e+00
4.75000000e+00 2.86666660e+01 2.15000000e+01 1.45833330e+01
5.17500000e+01 2.59166660e+01 1.55833330e+01 4.36666679e+01
1.25000000e+00 1.06666670e+01 4.19166679e+01 1.73333340e+01
1.10833330e+01 5.91666679e+01 1.70833340e+01 3.40000000e+01
1.66666660e+01 2.43333340e+01 4.20000000e+01 1.70000000e+01
7.16666651e+00 5.57500000e+01 2.40000000e+01 6.08333349e+00
1.92500000e+01 4.08333321e+01 1.01666670e+01 2.93333340e+01
2.66666660e+01 5.75000000e+00 6.26666679e+01 4.22500000e+01
1.95000000e+01 1.90000000e+01 3.60000000e+01 4.68333321e+01
1.29166670e+01 1.75000000e+00 1.27500000e+01 9.16666687e-01
6.21666679e+01 4.54166679e+01 9.08333302e+00 8.08333302e+00
5.40833321e+01 4.78333321e+01 7.08333349e+00 4.11666679e+01
3.15000000e+01 7.25000000e+00 4.14166679e+01 1.99166660e+01
6.08333321e+01 2.25000000e+00 1.30833330e+01 1.56666670e+01
1.15833330e+01 8.25000000e+00 7.00000000e+00 4.00000000e+00
3.33333325e+00 4.65833321e+01 6.16666651e+00 1.02500000e+01
6.00000000e+01 4.33333321e+01 4.65000000e+01 2.60833340e+01
1.31666670e+01 1.17500000e+01 2.55833340e+01 4.42500000e+01
1.51666670e+01 3.09166660e+01 2.65833340e+01 1.48333330e+01
5.38333321e+01 9.33333302e+00 2.66666675e+00 2.25833340e+01
6.28333321e+01 4.72500000e+01 2.50000000e+01 1.42500000e+01
1.38333330e+01 4.16666657e-01 2.13333340e+01 2.37500000e+01
3.98333321e+01 5.22500000e+01 1.53333330e+01 3.85833321e+01
3.00000000e+01 7.58333349e+00 4.25000000e+00 3.78333321e+01
3.87500000e+01 5.50000000e+01 4.85833321e+01 2.21666660e+01
6.47500000e+01 8.75000000e+00 3.16666675e+00 1.20833330e+01
6.66666651e+00 3.75000000e+01 1.43333330e+01 3.56666679e+01
1.40833330e+01 3.82500000e+01 4.58333321e+01 1.35833330e+01
5.33333349e+00 4.66666651e+00 1.22500000e+01 5.20833321e+01
1.08333330e+01 3.71666679e+01 2.83333325e+00 4.41666679e+01
6.90833359e+01 3.92500000e+01 3.80000000e+01 4.21666679e+01
3.46666679e+01 5.76666679e+01 1.07500000e+01 5.25000000e+00
6.07500000e+01 4.51666679e+01 5.10000000e+01 2.00000000e+00
1.28333330e+01 1.34166670e+01 6.40833359e+01 5.68333321e+01
2.07500000e+01 3.49166679e+01 4.09166679e+01 5.45000000e+01
5.35833321e+01 1.60000000e+01 3.60833321e+01 6.80000000e+01
2.61666660e+01 5.54166679e+01 4.77500000e+01 1.00000000e+00
3.33333321e+01 1.12500000e+01 6.00833321e+01 1.86666660e+01
1.79166660e+01 3.03333340e+01 3.73333321e+01 5.43333321e+01
5.15000000e+01 3.05833340e+01 4.55000000e+01 1.49166670e+01
4.15833321e+01 3.10000000e+01 3.05000000e+01 6.06666679e+01
3.48333321e+01 3.94166679e+01 4.47500000e+01 2.72500000e+01
3.15833340e+01 3.44166679e+01 2.84166660e+01 4.67500000e+01
2.35000000e+01 3.14166660e+01 4.37500000e+01 3.69166679e+01
2.45833340e+01 5.01666679e+01 6.12500000e+01 1.14166670e+01
2.20000000e+01 7.51666641e+01 6.45833359e+01 7.33333349e+00
6.69166641e+01 3.29166679e+01 5.12500000e+01 1.71666660e+01
5.45833321e+01 2.27500000e+01 5.62500000e+01 5.90000000e+01
5.08333349e+00 4.63333321e+01 2.32500000e+01 6.11666679e+01
4.74166679e+01 2.10833340e+01 9.50000000e+00 5.30833321e+01
1.81666660e+01 2.85833340e+01 3.36666679e+01 4.25000000e+01
4.66666679e+01 3.70833321e+01 6.73333359e+01 4.05833321e+01
5.65000000e+01 1.96666660e+01 2.18333340e+01 2.85000000e+01
4.44166679e+01 3.12500000e+01 4.45000000e+01 6.23333321e+01
4.40833321e+01 5.00833321e+01 6.10833321e+01 1.98333340e+01
4.99166679e+01 5.34166679e+01 2.97500000e+01 5.35000000e+01
5.88333321e+01 6.19166679e+01 4.29166679e+01 5.95833321e+01
5.40000000e+01 5.05000000e+01 1.65000000e+01 2.54166660e+01
5.86666679e+01 3.79166679e+01 3.45833321e+01 7.66666651e+00
2.31666660e+01 6.10000000e+01 6.70000000e+01 5.80000000e+01
5.70000000e+01 4.80000000e+01 7.40000000e+01 6.90000000e+01
7.10000000e+01 6.20000000e+01 6.40000000e+01 7.20000000e+01
5.74166679e+01 3.35833321e+01 6.75833359e+01 5.73333321e+01
6.25000000e+01 4.35000000e+01 4.27500000e+01 6.41666641e+01
5.18333321e+01 6.97500000e+01 5.71666679e+01 6.02500000e+01
5.83333321e+01 7.46666641e+01 5.31666679e+01 3.01666660e+01
5.97500000e+01 6.05000000e+01 5.23333321e+01 5.05833321e+01
3.26666679e+01 5.89166679e+01 5.16666679e+01 5.29166679e+01
7.14166641e+01 3.27500000e+01 5.64166679e+01 3.07500000e+01
5.85000000e+01 4.32500000e+01 2.39166660e+01 5.32500000e+01
6.04166679e+01 4.62500000e+01 6.54166641e+01 5.56666679e+01
5.04166679e+01 6.15000000e+01 5.82500000e+01 7.26666641e+01
5.85833321e+01 5.61666679e+01 6.55000000e+01 5.67500000e+01
6.53333359e+01 3.62500000e+01 5.14166679e+01 5.87500000e+01
1.85000000e+01 7.15833359e+01 6.09166679e+01 6.13333321e+01
3.31666679e+01 4.52500000e+01 4.96666679e+01 5.94166679e+01
1.01750000e+02]
Column: sex_frequency
Data type: object
Unique values: ['once_or_twice_a_week' 'once_a_month_or_less' '2_to_3_times_a_month' nan
'3_to_6_times_a_week' 'once_or_more_a_day']
Column: wave
Data type: object
Unique values: ['2017' '2020' '2022']
Column: flirts_with_partner
Data type: category
Unique values: [NaN, 'never', 'a_few_times_a_week', 'less_than_once_a_month', '1_to_3_times_a_month', 'every_day', 'once_a_week']
Categories (6, object): ['every_day' < 'a_few_times_a_week' < 'once_a_week' < '1_to_3_times_a_month' < 'less_than_once_a_month' < 'never']
Column: fights_with_partner
Data type: category
Unique values: [NaN, '0_times', '3_times', '2_times', '1_time', '7_or_more_times', '4_times', '5_times', '6_times']
Categories (8, object): ['0_times' < '1_time' < '2_times' < '3_times' < '4_times' < '5_times' < '6_times' < '7_or_more_times']
Column: rel_change_during_pandemic
Data type: object
Unique values: [nan 'better_than_before' 'no_change' 'worse_than_before']
Column: inc_change_during_pandemic
Data type: object
Unique values: [nan 'no_change' 'worse' 'much_worse' 'better' 'much_better']
Column: subject_had_covid
Data type: object
Unique values: [nan 'no' 'yes']
Column: partner_had_covid
Data type: object
Unique values: [nan 'yes' 'no']
Column: subject_vaccinated
Data type: object
Unique values: [nan 'not_vaccinated' 'fully_vaccinated_and_booster'
'fully_vaccinated_no_booster' 'partially_vaccinated']
Column: partner_vaccinated
Data type: object
Unique values: [nan 'not_vaccinated' 'fully_vaccinated_and_booster'
'partially_vaccinated' 'fully_vaccinated_no_booster']
Column: agree_covid_approach
Data type: object
Unique values: [nan 'completely_agree' 'mostly_agree' 'completely_disagree'
'mostly_disagree']
Column: children
Data type: float64
Unique values: [ 1. 2. 5. 4. 6. 3. 8. 10. 7. 9. 12.]
Selecting only 2022 data set:
During the client meeting on May 21st, 2025. It was decided to only keep 2022 data, while removing mostly sparse columns, id and wave values.
= data
clean
clean.head()
id | wave | subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | partner_sex | partner_age | partner_ethnicity | partner_education | same_sex_couple | married | sex_frequency | flirts_with_partner | fights_with_partner | relationship_start_year | marriage_year | break_up_year | relationship_duration | children | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | relationship_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 53001 | 2017 | 48.0 | high_school_grad | female | 2_plus_eth | 60k_75k | working_paid_employee | male | 46.0 | white | associate_degree | no | married | once_or_twice_a_week | NaN | NaN | 2013.0 | 2014.0 | NaN | 3.583333 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
1 | 71609 | 2017 | 68.0 | some_college | female | white | 50k_60k | working_paid_employee | male | 71.0 | white | some_college | no | married | once_a_month_or_less | NaN | NaN | 1964.0 | 1969.0 | NaN | 52.750000 | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
2 | 106983 | 2017 | 39.0 | associate_degree | male | white | 85k_100k | working_paid_employee | female | 49.0 | white | some_college | no | married | once_or_twice_a_week | NaN | NaN | 1999.0 | 2002.0 | NaN | 17.583334 | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
3 | 121759 | 2017 | 54.0 | high_school_grad | male | white | 100k_125k | working_paid_employee | female | 59.0 | asian | masters_degree | no | married | 2_to_3_times_a_month | NaN | NaN | 1990.0 | 1991.0 | NaN | 27.416666 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | excellent |
4 | 158083 | 2017 | 48.0 | some_college | male | white | 75k_85k | working_paid_employee | female | 34.0 | white | associate_degree | no | not_married | NaN | NaN | NaN | 2011.0 | 2013.0 | 2014.0 | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
clean.shape
(7339, 30)
Selecting only 2022 and removing NA from selected columns.
= clean[clean['wave'] == '2022'].dropna(subset=['rel_change_during_pandemic', 'relationship_quality']) data_2022
data_2022.shape
(1328, 30)
data_2022.columns.tolist()
['id',
'wave',
'subject_age',
'subject_education',
'subject_sex',
'subject_ethnicity',
'subject_income_category',
'subject_employment_status',
'partner_sex',
'partner_age',
'partner_ethnicity',
'partner_education',
'same_sex_couple',
'married',
'sex_frequency',
'flirts_with_partner',
'fights_with_partner',
'relationship_start_year',
'marriage_year',
'break_up_year',
'relationship_duration',
'children',
'rel_change_during_pandemic',
'inc_change_during_pandemic',
'subject_had_covid',
'partner_had_covid',
'subject_vaccinated',
'partner_vaccinated',
'agree_covid_approach',
'relationship_quality']
Final selection of columns for clean dataframe.
= data_2022[[
data_2022 'subject_age',
'subject_education',
'subject_sex',
'subject_ethnicity',
'subject_income_category',
'subject_employment_status',
'same_sex_couple',
'married',
'sex_frequency',
'flirts_with_partner',
'fights_with_partner',
'relationship_duration',
'children',
'rel_change_during_pandemic',
'inc_change_during_pandemic',
'subject_had_covid',
'partner_had_covid',
'subject_vaccinated',
'partner_vaccinated',
'agree_covid_approach',
'relationship_quality']]
data_2022.head()
subject_age | subject_education | subject_sex | subject_ethnicity | subject_income_category | subject_employment_status | same_sex_couple | married | sex_frequency | flirts_with_partner | fights_with_partner | relationship_duration | children | rel_change_during_pandemic | inc_change_during_pandemic | subject_had_covid | partner_had_covid | subject_vaccinated | partner_vaccinated | agree_covid_approach | relationship_quality | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7020 | 53.0 | high_school_grad | female | white | 35k_40k | working_paid_employee | no | not_married | once_or_twice_a_week | a_few_times_a_week | 0_times | 1.500000 | 2.0 | better_than_before | no_change | no | yes | not_vaccinated | not_vaccinated | completely_agree | excellent |
7021 | 72.0 | some_college | female | white | 75k_85k | working_paid_employee | no | married | once_a_month_or_less | never | 7_or_more_times | 57.416668 | 1.0 | no_change | worse | no | no | fully_vaccinated_and_booster | fully_vaccinated_and_booster | mostly_agree | good |
7022 | 43.0 | associate_degree | male | white | 75k_85k | working_paid_employee | no | married | once_or_twice_a_week | a_few_times_a_week | 2_times | 22.333334 | 5.0 | no_change | worse | no | no | fully_vaccinated_and_booster | fully_vaccinated_and_booster | completely_agree | excellent |
7025 | 64.0 | some_college | male | white | 75k_85k | working_paid_employee | no | married | once_or_twice_a_week | 1_to_3_times_a_month | 0_times | 28.250000 | 2.0 | no_change | no_change | no | no | fully_vaccinated_and_booster | fully_vaccinated_and_booster | completely_agree | good |
7027 | 60.0 | high_school_grad | female | black | 75k_85k | working_paid_employee | no | married | once_or_twice_a_week | a_few_times_a_week | 0_times | 38.916668 | 3.0 | better_than_before | no_change | no | no | not_vaccinated | partially_vaccinated | completely_agree | excellent |
data_2022.shape
(1328, 21)
Saving CSV
"../data/clean/hcmst.csv", index=False) data_2022.to_csv(