Skip to content

Commit e9a6419

Browse files
committed
Flip males
1 parent ea30f03 commit e9a6419

File tree

1 file changed

+163
-28
lines changed

1 file changed

+163
-28
lines changed

resources/equalized_odds_improvement_tutorial.ipynb

Lines changed: 163 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
},
8989
{
9090
"cell_type": "code",
91-
"execution_count": 3,
91+
"execution_count": null,
9292
"metadata": {},
9393
"outputs": [
9494
{
@@ -266,26 +266,57 @@
266266
"real_data, metadata = download_demo('single_table', 'adult')\n",
267267
"\n",
268268
"print(f\"Dataset shape: {real_data.shape}\")\n",
269-
"print(f\"\\nFirst few rows:\")\n",
270-
"real_data.head()"
269+
"print(f\"\\nOriginal data - First few rows:\")\n",
270+
"print(real_data.head())\n",
271+
"\n",
272+
"print(f\"\\nOriginal income distribution by sex:\")\n",
273+
"original_crosstab = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
274+
"print(original_crosstab)"
271275
]
272276
},
273277
{
274278
"cell_type": "code",
275-
"execution_count": 4,
279+
"execution_count": null,
276280
"metadata": {},
277281
"outputs": [],
278282
"source": [
279-
"# Flip the labels for the Female rows\n",
280-
"mask_female = real_data['sex'] == 'Female'\n",
281-
"real_data.loc[mask_female, 'label'] = real_data.loc[mask_female, 'label'].map(\n",
282-
" {'<=50K': '>50K', '>50K': '<=50K'}\n",
283-
")"
283+
"# Create a copy of the original data for our modifications\n",
284+
"modified_data = real_data.copy()\n",
285+
"\n",
286+
"# For sex=Male: If salary is <50K, flip it to >=50K with 25% probability\n",
287+
"# If salary is >=50K, keep as-is\n",
288+
"# Keep sex=Female as-is\n",
289+
"\n",
290+
"np.random.seed(42) # For reproducibility\n",
291+
"\n",
292+
"# Find Male rows with <50K salary\n",
293+
"mask_male_low_salary = (modified_data['sex'] == 'Male') & (modified_data['label'] == '<=50K')\n",
294+
"male_low_salary_indices = modified_data[mask_male_low_salary].index\n",
295+
"\n",
296+
"# Generate random probabilities for each Male with <50K salary\n",
297+
"random_probs = np.random.random(len(male_low_salary_indices))\n",
298+
"\n",
299+
"# Flip to >=50K with 25% probability\n",
300+
"flip_mask = random_probs < 0.25\n",
301+
"indices_to_flip = male_low_salary_indices[flip_mask]\n",
302+
"\n",
303+
"print(f\"Total Males with <=50K salary: {len(male_low_salary_indices)}\")\n",
304+
"print(f\"Males being flipped to >50K (25% probability): {len(indices_to_flip)}\")\n",
305+
"\n",
306+
"# Apply the flips\n",
307+
"modified_data.loc[indices_to_flip, 'label'] = '>50K'\n",
308+
"\n",
309+
"print(f\"\\nModified income distribution by sex:\")\n",
310+
"modified_crosstab = pd.crosstab(modified_data['sex'], modified_data['label'], normalize='index') * 100\n",
311+
"print(modified_crosstab)\n",
312+
"\n",
313+
"# Use the modified data for the rest of the analysis\n",
314+
"real_data = modified_data"
284315
]
285316
},
286317
{
287318
"cell_type": "code",
288-
"execution_count": 5,
319+
"execution_count": null,
289320
"metadata": {},
290321
"outputs": [
291322
{
@@ -300,25 +331,47 @@
300331
}
301332
],
302333
"source": [
303-
"# Visualize the distributions\n",
304-
"fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
305-
"\n",
306-
"# income distribution by sex\n",
307-
"crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
308-
"crosstab_pct.plot(kind='bar', ax=axes[0], rot=0)\n",
309-
"axes[0].set_title('Income Distribution by Sex (%)')\n",
310-
"axes[0].set_xlabel('Sex')\n",
311-
"axes[0].set_ylabel('Percentage')\n",
312-
"axes[0].legend(title='Income')\n",
313-
"\n",
314-
"# Overall income distribution\n",
315-
"real_data['label'].value_counts().plot(kind='bar', ax=axes[1], rot=0)\n",
316-
"axes[1].set_title('Overall Income Distribution')\n",
317-
"axes[1].set_xlabel('Income')\n",
318-
"axes[1].set_ylabel('Count')\n",
334+
"# Visualize the distributions - comparison between original and modified\n",
335+
"fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
336+
"\n",
337+
"# Load original data for comparison\n",
338+
"original_data, _ = download_demo('single_table', 'adult')\n",
339+
"\n",
340+
"# Original income distribution by sex\n",
341+
"original_crosstab_pct = pd.crosstab(original_data['sex'], original_data['label'], normalize='index') * 100\n",
342+
"original_crosstab_pct.plot(kind='bar', ax=axes[0, 0], rot=0, color=['lightcoral', 'lightblue'])\n",
343+
"axes[0, 0].set_title('Original: Income Distribution by Sex (%)')\n",
344+
"axes[0, 0].set_xlabel('Sex')\n",
345+
"axes[0, 0].set_ylabel('Percentage')\n",
346+
"axes[0, 0].legend(title='Income')\n",
347+
"\n",
348+
"# Modified income distribution by sex\n",
349+
"modified_crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
350+
"modified_crosstab_pct.plot(kind='bar', ax=axes[0, 1], rot=0, color=['lightcoral', 'lightblue'])\n",
351+
"axes[0, 1].set_title('Modified: Income Distribution by Sex (%)')\n",
352+
"axes[0, 1].set_xlabel('Sex')\n",
353+
"axes[0, 1].set_ylabel('Percentage')\n",
354+
"axes[0, 1].legend(title='Income')\n",
355+
"\n",
356+
"# Original overall income distribution\n",
357+
"original_data['label'].value_counts().plot(kind='bar', ax=axes[1, 0], rot=0, color=['lightcoral', 'lightblue'])\n",
358+
"axes[1, 0].set_title('Original: Overall Income Distribution')\n",
359+
"axes[1, 0].set_xlabel('Income')\n",
360+
"axes[1, 0].set_ylabel('Count')\n",
361+
"\n",
362+
"# Modified overall income distribution\n",
363+
"real_data['label'].value_counts().plot(kind='bar', ax=axes[1, 1], rot=0, color=['lightcoral', 'lightblue'])\n",
364+
"axes[1, 1].set_title('Modified: Overall Income Distribution')\n",
365+
"axes[1, 1].set_xlabel('Income')\n",
366+
"axes[1, 1].set_ylabel('Count')\n",
319367
"\n",
320368
"plt.tight_layout()\n",
321-
"plt.show()"
369+
"plt.show()\n",
370+
"\n",
371+
"print(\"Summary of changes:\")\n",
372+
"print(\"Original data - Males with >50K salary: {:.1f}%\".format(original_crosstab_pct.loc['Male', '>50K']))\n",
373+
"print(\"Modified data - Males with >50K salary: {:.1f}%\".format(modified_crosstab_pct.loc['Male', '>50K']))\n",
374+
"print(\"Females remain unchanged: {:.1f}%\".format(modified_crosstab_pct.loc['Female', '>50K']))"
322375
]
323376
},
324377
{
@@ -486,7 +539,15 @@
486539
"print(f\"\\nScore Interpretation:\")\n",
487540
"print(f\"- Score > 0.5 means synthetic data improves fairness\")\n",
488541
"print(f\"- Score < 0.5 means synthetic data worsens fairness\")\n",
489-
"print(f\"- Score = 0.5 means no change in fairness\")"
542+
"print(f\"- Score = 0.5 means no change in fairness\")\n",
543+
"\n",
544+
"print(f\"\\nParameters used (as requested):\")\n",
545+
"print(f\"- positive_class_label='>50K'\")\n",
546+
"print(f\"- sensitive_column_name='sex'\")\n",
547+
"print(f\"- sensitive_column_value='Female'\")\n",
548+
"\n",
549+
"print(f\"\\nValidation data distribution after modification:\")\n",
550+
"print(pd.crosstab(validation_data['label'], validation_data['sex']))"
490551
]
491552
},
492553
{
@@ -727,6 +788,80 @@
727788
"Let's compare the results from both approaches to analyze the impact of balanced sampling on fairness."
728789
]
729790
},
791+
{
792+
"cell_type": "raw",
793+
"metadata": {
794+
"vscode": {
795+
"languageId": "raw"
796+
}
797+
},
798+
"source": [
799+
"## Step 8: Analysis of the Modified Data Approach\n",
800+
"\n",
801+
"The approach we used modifies the original data by relabeling a randomly chosen ~25% of Male rows with <=50K income as >50K, while keeping Females unchanged. This makes >50K income more likely for Males and creates an intentional bias to test the fairness metrics.\n",
802+
"\n",
803+
"Key aspects of this approach:\n",
804+
"1. **Preserves minority label**: >50K remains the minority class overall\n",
805+
"2. **Creates gender disparity**: Males now have significantly higher rates of >50K income\n",
806+
"3. **Tests metric sensitivity**: Evaluates how well the fairness metric detects this intentional bias\n"
807+
]
808+
},
809+
{
810+
"cell_type": "code",
811+
"execution_count": null,
812+
"metadata": {},
813+
"outputs": [],
814+
"source": [
815+
"# Let's analyze the impact of our modification approach\n",
816+
"print(\"=== ANALYSIS OF MODIFIED DATA APPROACH ===\")\n",
817+
"print()\n",
818+
"\n",
819+
"# Calculate exact probabilities\n",
820+
"print(\"1. Impact on Male income distribution:\")\n",
821+
"original_data_reload, _ = download_demo('single_table', 'adult')\n",
822+
"original_male_high_rate = (original_data_reload['sex'] == 'Male') & (original_data_reload['label'] == '>50K')\n",
823+
"original_male_total = (original_data_reload['sex'] == 'Male')\n",
824+
"original_male_pct = original_male_high_rate.sum() / original_male_total.sum() * 100\n",
825+
"\n",
826+
"modified_male_high_rate = (real_data['sex'] == 'Male') & (real_data['label'] == '>50K')\n",
827+
"modified_male_total = (real_data['sex'] == 'Male')\n",
828+
"modified_male_pct = modified_male_high_rate.sum() / modified_male_total.sum() * 100\n",
829+
"\n",
830+
"print(f\" Original: {original_male_pct:.1f}% of Males have >50K income\")\n",
831+
"print(f\" Modified: {modified_male_pct:.1f}% of Males have >50K income\")\n",
832+
"print(f\" Improvement: +{modified_male_pct - original_male_pct:.1f} percentage points\")\n",
833+
"\n",
834+
"print()\n",
835+
"print(\"2. Female distribution (unchanged):\")\n",
836+
"original_female_high_rate = (original_data_reload['sex'] == 'Female') & (original_data_reload['label'] == '>50K')\n",
837+
"original_female_total = (original_data_reload['sex'] == 'Female')\n",
838+
"original_female_pct = original_female_high_rate.sum() / original_female_total.sum() * 100\n",
839+
"\n",
840+
"modified_female_high_rate = (real_data['sex'] == 'Female') & (real_data['label'] == '>50K')\n",
841+
"modified_female_total = (real_data['sex'] == 'Female')\n",
842+
"modified_female_pct = modified_female_high_rate.sum() / modified_female_total.sum() * 100\n",
843+
"\n",
844+
"print(f\" Original: {original_female_pct:.1f}% of Females have >50K income\")\n",
845+
"print(f\" Modified: {modified_female_pct:.1f}% of Females have >50K income\")\n",
846+
"\n",
847+
"print()\n",
848+
"print(\"3. Overall minority label preservation:\")\n",
849+
"original_high_pct = (original_data_reload['label'] == '>50K').sum() / len(original_data_reload) * 100\n",
850+
"modified_high_pct = (real_data['label'] == '>50K').sum() / len(real_data) * 100\n",
851+
"\n",
852+
"print(f\" Original: {original_high_pct:.1f}% overall have >50K income\")\n",
853+
"print(f\" Modified: {modified_high_pct:.1f}% overall have >50K income\")\n",
854+
"\n",
855+
"print()\n",
856+
"print(\"4. Gender disparity created:\")\n",
857+
"disparity_original = original_male_pct - original_female_pct\n",
858+
"disparity_modified = modified_male_pct - modified_female_pct\n",
859+
"\n",
860+
"print(f\" Original disparity (Male - Female): {disparity_original:.1f} percentage points\")\n",
861+
"print(f\" Modified disparity (Male - Female): {disparity_modified:.1f} percentage points\")\n",
862+
"print(f\" Disparity increase: +{disparity_modified - disparity_original:.1f} percentage points\")\n"
863+
]
864+
},
730865
{
731866
"cell_type": "code",
732867
"execution_count": 14,

0 commit comments

Comments
 (0)