|
88 | 88 | },
|
89 | 89 | {
|
90 | 90 | "cell_type": "code",
|
91 |
| - "execution_count": 3, |
| 91 | + "execution_count": null, |
92 | 92 | "metadata": {},
|
93 | 93 | "outputs": [
|
94 | 94 | {
|
|
266 | 266 | "real_data, metadata = download_demo('single_table', 'adult')\n",
|
267 | 267 | "\n",
|
268 | 268 | "print(f\"Dataset shape: {real_data.shape}\")\n",
|
269 |
| - "print(f\"\\nFirst few rows:\")\n", |
270 |
| - "real_data.head()" |
| 269 | + "print(f\"\\nOriginal data - First few rows:\")\n", |
| 270 | + "print(real_data.head())\n", |
| 271 | + "\n", |
| 272 | + "print(f\"\\nOriginal income distribution by sex:\")\n", |
| 273 | + "original_crosstab = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n", |
| 274 | + "print(original_crosstab)" |
271 | 275 | ]
|
272 | 276 | },
|
273 | 277 | {
|
274 | 278 | "cell_type": "code",
|
275 |
| - "execution_count": 4, |
| 279 | + "execution_count": null, |
276 | 280 | "metadata": {},
|
277 | 281 | "outputs": [],
|
278 | 282 | "source": [
|
279 |
| - "# Flip the labels for the Female rows\n", |
280 |
| - "mask_female = real_data['sex'] == 'Female'\n", |
281 |
| - "real_data.loc[mask_female, 'label'] = real_data.loc[mask_female, 'label'].map(\n", |
282 |
| - " {'<=50K': '>50K', '>50K': '<=50K'}\n", |
283 |
| - ")" |
| 283 | + "# Create a copy of the original data for our modifications\n", |
| 284 | + "modified_data = real_data.copy()\n", |
| 285 | + "\n", |
| 286 | + "# For sex=Male: if the label is <=50K, flip it to >50K with 25% probability\n", |
| 287 | + "# If the label is already >50K, keep as-is\n", |
| 288 | + "# Keep sex=Female as-is\n", |
| 289 | + "\n", |
| 290 | + "np.random.seed(42) # For reproducibility\n", |
| 291 | + "\n", |
| 292 | + "# Find Male rows labeled <=50K\n", |
| 293 | + "mask_male_low_salary = (modified_data['sex'] == 'Male') & (modified_data['label'] == '<=50K')\n", |
| 294 | + "male_low_salary_indices = modified_data[mask_male_low_salary].index\n", |
| 295 | + "\n", |
| 296 | + "# Draw one uniform random number in [0, 1) for each Male row labeled <=50K\n", |
| 297 | + "random_probs = np.random.random(len(male_low_salary_indices))\n", |
| 298 | + "\n", |
| 299 | + "# Flip to >50K where the draw falls below 0.25 (25% probability)\n", |
| 300 | + "flip_mask = random_probs < 0.25\n", |
| 301 | + "indices_to_flip = male_low_salary_indices[flip_mask]\n", |
| 302 | + "\n", |
| 303 | + "print(f\"Total Males with <=50K salary: {len(male_low_salary_indices)}\")\n", |
| 304 | + "print(f\"Males being flipped to >50K (25% probability): {len(indices_to_flip)}\")\n", |
| 305 | + "\n", |
| 306 | + "# Apply the flips\n", |
| 307 | + "modified_data.loc[indices_to_flip, 'label'] = '>50K'\n", |
| 308 | + "\n", |
| 309 | + "print(f\"\\nModified income distribution by sex:\")\n", |
| 310 | + "modified_crosstab = pd.crosstab(modified_data['sex'], modified_data['label'], normalize='index') * 100\n", |
| 311 | + "print(modified_crosstab)\n", |
| 312 | + "\n", |
| 313 | + "# Use the modified data for the rest of the analysis\n", |
| 314 | + "real_data = modified_data" |
284 | 315 | ]
|
285 | 316 | },
|
286 | 317 | {
|
287 | 318 | "cell_type": "code",
|
288 |
| - "execution_count": 5, |
| 319 | + "execution_count": null, |
289 | 320 | "metadata": {},
|
290 | 321 | "outputs": [
|
291 | 322 | {
|
|
300 | 331 | }
|
301 | 332 | ],
|
302 | 333 | "source": [
|
303 |
| - "# Visualize the distributions\n", |
304 |
| - "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n", |
305 |
| - "\n", |
306 |
| - "# income distribution by sex\n", |
307 |
| - "crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n", |
308 |
| - "crosstab_pct.plot(kind='bar', ax=axes[0], rot=0)\n", |
309 |
| - "axes[0].set_title('Income Distribution by Sex (%)')\n", |
310 |
| - "axes[0].set_xlabel('Sex')\n", |
311 |
| - "axes[0].set_ylabel('Percentage')\n", |
312 |
| - "axes[0].legend(title='Income')\n", |
313 |
| - "\n", |
314 |
| - "# Overall income distribution\n", |
315 |
| - "real_data['label'].value_counts().plot(kind='bar', ax=axes[1], rot=0)\n", |
316 |
| - "axes[1].set_title('Overall Income Distribution')\n", |
317 |
| - "axes[1].set_xlabel('Income')\n", |
318 |
| - "axes[1].set_ylabel('Count')\n", |
| 334 | + "# Visualize the distributions - comparison between original and modified\n", |
| 335 | + "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n", |
| 336 | + "\n", |
| 337 | + "# Load original data for comparison\n", |
| 338 | + "original_data, _ = download_demo('single_table', 'adult')\n", |
| 339 | + "\n", |
| 340 | + "# Original income distribution by sex\n", |
| 341 | + "original_crosstab_pct = pd.crosstab(original_data['sex'], original_data['label'], normalize='index') * 100\n", |
| 342 | + "original_crosstab_pct.plot(kind='bar', ax=axes[0, 0], rot=0, color=['lightcoral', 'lightblue'])\n", |
| 343 | + "axes[0, 0].set_title('Original: Income Distribution by Sex (%)')\n", |
| 344 | + "axes[0, 0].set_xlabel('Sex')\n", |
| 345 | + "axes[0, 0].set_ylabel('Percentage')\n", |
| 346 | + "axes[0, 0].legend(title='Income')\n", |
| 347 | + "\n", |
| 348 | + "# Modified income distribution by sex\n", |
| 349 | + "modified_crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n", |
| 350 | + "modified_crosstab_pct.plot(kind='bar', ax=axes[0, 1], rot=0, color=['lightcoral', 'lightblue'])\n", |
| 351 | + "axes[0, 1].set_title('Modified: Income Distribution by Sex (%)')\n", |
| 352 | + "axes[0, 1].set_xlabel('Sex')\n", |
| 353 | + "axes[0, 1].set_ylabel('Percentage')\n", |
| 354 | + "axes[0, 1].legend(title='Income')\n", |
| 355 | + "\n", |
| 356 | + "# Original overall income distribution\n", |
| 357 | + "original_data['label'].value_counts().plot(kind='bar', ax=axes[1, 0], rot=0, color=['lightcoral', 'lightblue'])\n", |
| 358 | + "axes[1, 0].set_title('Original: Overall Income Distribution')\n", |
| 359 | + "axes[1, 0].set_xlabel('Income')\n", |
| 360 | + "axes[1, 0].set_ylabel('Count')\n", |
| 361 | + "\n", |
| 362 | + "# Modified overall income distribution\n", |
| 363 | + "real_data['label'].value_counts().plot(kind='bar', ax=axes[1, 1], rot=0, color=['lightcoral', 'lightblue'])\n", |
| 364 | + "axes[1, 1].set_title('Modified: Overall Income Distribution')\n", |
| 365 | + "axes[1, 1].set_xlabel('Income')\n", |
| 366 | + "axes[1, 1].set_ylabel('Count')\n", |
319 | 367 | "\n",
|
320 | 368 | "plt.tight_layout()\n",
|
321 |
| - "plt.show()" |
| 369 | + "plt.show()\n", |
| 370 | + "\n", |
| 371 | + "print(\"Summary of changes:\")\n", |
| 372 | + "print(\"Original data - Males with >50K salary: {:.1f}%\".format(original_crosstab_pct.loc['Male', '>50K']))\n", |
| 373 | + "print(\"Modified data - Males with >50K salary: {:.1f}%\".format(modified_crosstab_pct.loc['Male', '>50K']))\n", |
| 374 | + "print(\"Females remain unchanged: {:.1f}%\".format(modified_crosstab_pct.loc['Female', '>50K']))" |
322 | 375 | ]
|
323 | 376 | },
|
324 | 377 | {
|
|
486 | 539 | "print(f\"\\nScore Interpretation:\")\n",
|
487 | 540 | "print(f\"- Score > 0.5 means synthetic data improves fairness\")\n",
|
488 | 541 | "print(f\"- Score < 0.5 means synthetic data worsens fairness\")\n",
|
489 |
| - "print(f\"- Score = 0.5 means no change in fairness\")" |
| 542 | + "print(f\"- Score = 0.5 means no change in fairness\")\n", |
| 543 | + "\n", |
| 544 | + "print(f\"\\nParameters used (as requested):\")\n", |
| 545 | + "print(f\"- positive_class_label='>50K'\")\n", |
| 546 | + "print(f\"- sensitive_column_name='sex'\")\n", |
| 547 | + "print(f\"- sensitive_column_value='Female'\")\n", |
| 548 | + "\n", |
| 549 | + "print(f\"\\nValidation data distribution after modification:\")\n", |
| 550 | + "print(pd.crosstab(validation_data['label'], validation_data['sex']))" |
490 | 551 | ]
|
491 | 552 | },
|
492 | 553 | {
|
|
727 | 788 | "Let's compare the results from both approaches to analyze the impact of balanced sampling on fairness."
|
728 | 789 | ]
|
729 | 790 | },
|
| 791 | + { |
| 792 | + "cell_type": "markdown", |
| 793 | + "metadata": { |
| 794 | + "vscode": { |
| 795 | + "languageId": "raw" |
| 796 | + } |
| 797 | + }, |
| 798 | + "source": [ |
| 799 | + "## Step 8: Analysis of the Modified Data Approach\n", |
| 800 | + "\n", |
| 801 | + "The approach we used modifies the original data by relabeling Males with <=50K income as >50K, each with 25% probability, while keeping Females unchanged. This creates an intentional bias to test the fairness metrics.\n", |
| 802 | + "\n", |
| 803 | + "Key aspects of this approach:\n", |
| 804 | + "1. **Preserves minority label**: >50K remains the minority class overall\n", |
| 805 | + "2. **Creates gender disparity**: Males now have significantly higher rates of >50K income\n", |
| 806 | + "3. **Tests metric sensitivity**: Evaluates how well the fairness metric detects this intentional bias\n" |
| 807 | + ] |
| 808 | + }, |
| 809 | + { |
| 810 | + "cell_type": "code", |
| 811 | + "execution_count": null, |
| 812 | + "metadata": {}, |
| 813 | + "outputs": [], |
| 814 | + "source": [ |
| 815 | + "# Let's analyze the impact of our modification approach\n", |
| 816 | + "print(\"=== ANALYSIS OF MODIFIED DATA APPROACH ===\")\n", |
| 817 | + "print()\n", |
| 818 | + "\n", |
| 819 | + "# Calculate exact probabilities\n", |
| 820 | + "print(\"1. Impact on Male income distribution:\")\n", |
| 821 | + "original_data_reload, _ = download_demo('single_table', 'adult')\n", |
| 822 | + "original_male_high_rate = (original_data_reload['sex'] == 'Male') & (original_data_reload['label'] == '>50K')\n", |
| 823 | + "original_male_total = (original_data_reload['sex'] == 'Male')\n", |
| 824 | + "original_male_pct = original_male_high_rate.sum() / original_male_total.sum() * 100\n", |
| 825 | + "\n", |
| 826 | + "modified_male_high_rate = (real_data['sex'] == 'Male') & (real_data['label'] == '>50K')\n", |
| 827 | + "modified_male_total = (real_data['sex'] == 'Male')\n", |
| 828 | + "modified_male_pct = modified_male_high_rate.sum() / modified_male_total.sum() * 100\n", |
| 829 | + "\n", |
| 830 | + "print(f\" Original: {original_male_pct:.1f}% of Males have >50K income\")\n", |
| 831 | + "print(f\" Modified: {modified_male_pct:.1f}% of Males have >50K income\")\n", |
| 832 | + "print(f\" Improvement: +{modified_male_pct - original_male_pct:.1f} percentage points\")\n", |
| 833 | + "\n", |
| 834 | + "print()\n", |
| 835 | + "print(\"2. Female distribution (unchanged):\")\n", |
| 836 | + "original_female_high_rate = (original_data_reload['sex'] == 'Female') & (original_data_reload['label'] == '>50K')\n", |
| 837 | + "original_female_total = (original_data_reload['sex'] == 'Female')\n", |
| 838 | + "original_female_pct = original_female_high_rate.sum() / original_female_total.sum() * 100\n", |
| 839 | + "\n", |
| 840 | + "modified_female_high_rate = (real_data['sex'] == 'Female') & (real_data['label'] == '>50K')\n", |
| 841 | + "modified_female_total = (real_data['sex'] == 'Female')\n", |
| 842 | + "modified_female_pct = modified_female_high_rate.sum() / modified_female_total.sum() * 100\n", |
| 843 | + "\n", |
| 844 | + "print(f\" Original: {original_female_pct:.1f}% of Females have >50K income\")\n", |
| 845 | + "print(f\" Modified: {modified_female_pct:.1f}% of Females have >50K income\")\n", |
| 846 | + "\n", |
| 847 | + "print()\n", |
| 848 | + "print(\"3. Overall minority label preservation:\")\n", |
| 849 | + "original_high_pct = (original_data_reload['label'] == '>50K').sum() / len(original_data_reload) * 100\n", |
| 850 | + "modified_high_pct = (real_data['label'] == '>50K').sum() / len(real_data) * 100\n", |
| 851 | + "\n", |
| 852 | + "print(f\" Original: {original_high_pct:.1f}% overall have >50K income\")\n", |
| 853 | + "print(f\" Modified: {modified_high_pct:.1f}% overall have >50K income\")\n", |
| 854 | + "\n", |
| 855 | + "print()\n", |
| 856 | + "print(\"4. Gender disparity created:\")\n", |
| 857 | + "disparity_original = original_male_pct - original_female_pct\n", |
| 858 | + "disparity_modified = modified_male_pct - modified_female_pct\n", |
| 859 | + "\n", |
| 860 | + "print(f\" Original disparity (Male - Female): {disparity_original:.1f} percentage points\")\n", |
| 861 | + "print(f\" Modified disparity (Male - Female): {disparity_modified:.1f} percentage points\")\n", |
| 862 | + "print(f\" Disparity increase: +{disparity_modified - disparity_original:.1f} percentage points\")\n" |
| 863 | + ] |
| 864 | + }, |
730 | 865 | {
|
731 | 866 | "cell_type": "code",
|
732 | 867 | "execution_count": 14,
|
|
0 commit comments