Flip males

fealho · fealho · commit e9a641937aac · 2025-06-26T12:41:18.000-07:00
diff --git a/resources/equalized_odds_improvement_tutorial.ipynb b/resources/equalized_odds_improvement_tutorial.ipynb
@@ -88,7 +88,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -266,26 +266,57 @@
     "real_data, metadata = download_demo('single_table', 'adult')\n",
     "\n",
     "print(f\"Dataset shape: {real_data.shape}\")\n",
-    "print(f\"\\nFirst few rows:\")\n",
-    "real_data.head()"
+    "print(f\"\\nOriginal data - First few rows:\")\n",
+    "print(real_data.head())\n",
+    "\n",
+    "print(f\"\\nOriginal income distribution by sex:\")\n",
+    "original_crosstab = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
+    "print(original_crosstab)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Flip the labels for the Female rows\n",
-    "mask_female = real_data['sex'] == 'Female'\n",
-    "real_data.loc[mask_female, 'label'] = real_data.loc[mask_female, 'label'].map(\n",
-    "    {'<=50K': '>50K', '>50K': '<=50K'}\n",
-    ")"
+    "# Create a copy of the original data for our modifications\n",
+    "modified_data = real_data.copy()\n",
+    "\n",
+    "# For sex=Male: if the label is '<=50K', flip it to '>50K' with 25% probability\n",
+    "# If the label is already '>50K', keep as-is\n",
+    "# Keep sex=Female as-is\n",
+    "\n",
+    "np.random.seed(42)  # For reproducibility\n",
+    "\n",
+    "# Find Male rows labeled '<=50K'\n",
+    "mask_male_low_salary = (modified_data['sex'] == 'Male') & (modified_data['label'] == '<=50K')\n",
+    "male_low_salary_indices = modified_data[mask_male_low_salary].index\n",
+    "\n",
+    "# Draw a uniform random number in [0, 1) for each Male row labeled '<=50K'\n",
+    "random_probs = np.random.random(len(male_low_salary_indices))\n",
+    "\n",
+    "# Flip to '>50K' when the draw falls below 0.25 (25% probability)\n",
+    "flip_mask = random_probs < 0.25\n",
+    "indices_to_flip = male_low_salary_indices[flip_mask]\n",
+    "\n",
+    "print(f\"Total Males with <=50K salary: {len(male_low_salary_indices)}\")\n",
+    "print(f\"Males being flipped to >50K (25% probability): {len(indices_to_flip)}\")\n",
+    "\n",
+    "# Apply the flips\n",
+    "modified_data.loc[indices_to_flip, 'label'] = '>50K'\n",
+    "\n",
+    "print(f\"\\nModified income distribution by sex:\")\n",
+    "modified_crosstab = pd.crosstab(modified_data['sex'], modified_data['label'], normalize='index') * 100\n",
+    "print(modified_crosstab)\n",
+    "\n",
+    "# Use the modified data for the rest of the analysis\n",
+    "real_data = modified_data"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -300,25 +331,47 @@
     }
    ],
    "source": [
-    "# Visualize the distributions\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
-    "\n",
-    "# income distribution by sex\n",
-    "crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
-    "crosstab_pct.plot(kind='bar', ax=axes[0], rot=0)\n",
-    "axes[0].set_title('Income Distribution by Sex (%)')\n",
-    "axes[0].set_xlabel('Sex')\n",
-    "axes[0].set_ylabel('Percentage')\n",
-    "axes[0].legend(title='Income')\n",
-    "\n",
-    "# Overall income distribution\n",
-    "real_data['label'].value_counts().plot(kind='bar', ax=axes[1], rot=0)\n",
-    "axes[1].set_title('Overall Income Distribution')\n",
-    "axes[1].set_xlabel('Income')\n",
-    "axes[1].set_ylabel('Count')\n",
+    "# Visualize the distributions - comparison between original and modified\n",
+    "fig, axes = plt.subplots(2, 2, figsize=(15, 10))\n",
+    "\n",
+    "# Load original data for comparison\n",
+    "original_data, _ = download_demo('single_table', 'adult')\n",
+    "\n",
+    "# Original income distribution by sex\n",
+    "original_crosstab_pct = pd.crosstab(original_data['sex'], original_data['label'], normalize='index') * 100\n",
+    "original_crosstab_pct.plot(kind='bar', ax=axes[0, 0], rot=0, color=['lightcoral', 'lightblue'])\n",
+    "axes[0, 0].set_title('Original: Income Distribution by Sex (%)')\n",
+    "axes[0, 0].set_xlabel('Sex')\n",
+    "axes[0, 0].set_ylabel('Percentage')\n",
+    "axes[0, 0].legend(title='Income')\n",
+    "\n",
+    "# Modified income distribution by sex\n",
+    "modified_crosstab_pct = pd.crosstab(real_data['sex'], real_data['label'], normalize='index') * 100\n",
+    "modified_crosstab_pct.plot(kind='bar', ax=axes[0, 1], rot=0, color=['lightcoral', 'lightblue'])\n",
+    "axes[0, 1].set_title('Modified: Income Distribution by Sex (%)')\n",
+    "axes[0, 1].set_xlabel('Sex')\n",
+    "axes[0, 1].set_ylabel('Percentage')\n",
+    "axes[0, 1].legend(title='Income')\n",
+    "\n",
+    "# Original overall income distribution\n",
+    "original_data['label'].value_counts().plot(kind='bar', ax=axes[1, 0], rot=0, color=['lightcoral', 'lightblue'])\n",
+    "axes[1, 0].set_title('Original: Overall Income Distribution')\n",
+    "axes[1, 0].set_xlabel('Income')\n",
+    "axes[1, 0].set_ylabel('Count')\n",
+    "\n",
+    "# Modified overall income distribution\n",
+    "real_data['label'].value_counts().plot(kind='bar', ax=axes[1, 1], rot=0, color=['lightcoral', 'lightblue'])\n",
+    "axes[1, 1].set_title('Modified: Overall Income Distribution')\n",
+    "axes[1, 1].set_xlabel('Income')\n",
+    "axes[1, 1].set_ylabel('Count')\n",
     "\n",
     "plt.tight_layout()\n",
-    "plt.show()"
+    "plt.show()\n",
+    "\n",
+    "print(\"Summary of changes:\")\n",
+    "print(\"Original data - Males with >50K salary: {:.1f}%\".format(original_crosstab_pct.loc['Male', '>50K']))\n",
+    "print(\"Modified data - Males with >50K salary: {:.1f}%\".format(modified_crosstab_pct.loc['Male', '>50K']))\n",
+    "print(\"Females remain unchanged: {:.1f}%\".format(modified_crosstab_pct.loc['Female', '>50K']))"
    ]
   },
   {
@@ -486,7 +539,15 @@
     "print(f\"\\nScore Interpretation:\")\n",
     "print(f\"- Score > 0.5 means synthetic data improves fairness\")\n",
     "print(f\"- Score < 0.5 means synthetic data worsens fairness\")\n",
-    "print(f\"- Score = 0.5 means no change in fairness\")"
+    "print(f\"- Score = 0.5 means no change in fairness\")\n",
+    "\n",
+    "print(f\"\\nParameters used (as requested):\")\n",
+    "print(f\"- positive_class_label='>50K'\")\n",
+    "print(f\"- sensitive_column_name='sex'\")\n",
+    "print(f\"- sensitive_column_value='Female'\")\n",
+    "\n",
+    "print(f\"\\nValidation data distribution after modification:\")\n",
+    "print(pd.crosstab(validation_data['label'], validation_data['sex']))"
    ]
   },
   {
@@ -727,6 +788,80 @@
     "Let's compare the results from both approaches to analyze the impact of balanced sampling on fairness."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "vscode": {
+     "languageId": "markdown"
+    }
+   },
+   "source": [
+    "## Step 8: Analysis of the Modified Data Approach\n",
+    "\n",
+    "The approach we used modifies the original data by relabeling each Male row whose label is <=50K to >50K with 25% probability, while keeping Female rows unchanged. This creates an intentional bias to test the fairness metrics.\n",
+    "\n",
+    "Key aspects of this approach:\n",
+    "1. **Preserves minority label**: >50K remains the minority class overall\n",
+    "2. **Creates gender disparity**: Males now have significantly higher rates of >50K income\n",
+    "3. **Tests metric sensitivity**: Evaluates how well the fairness metric detects this intentional bias\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's analyze the impact of our modification approach\n",
+    "print(\"=== ANALYSIS OF MODIFIED DATA APPROACH ===\")\n",
+    "print()\n",
+    "\n",
+    "# Calculate exact probabilities\n",
+    "print(\"1. Impact on Male income distribution:\")\n",
+    "original_data_reload, _ = download_demo('single_table', 'adult')\n",
+    "original_male_high_rate = (original_data_reload['sex'] == 'Male') & (original_data_reload['label'] == '>50K')\n",
+    "original_male_total = (original_data_reload['sex'] == 'Male')\n",
+    "original_male_pct = original_male_high_rate.sum() / original_male_total.sum() * 100\n",
+    "\n",
+    "modified_male_high_rate = (real_data['sex'] == 'Male') & (real_data['label'] == '>50K')\n",
+    "modified_male_total = (real_data['sex'] == 'Male')\n",
+    "modified_male_pct = modified_male_high_rate.sum() / modified_male_total.sum() * 100\n",
+    "\n",
+    "print(f\"   Original: {original_male_pct:.1f}% of Males have >50K income\")\n",
+    "print(f\"   Modified: {modified_male_pct:.1f}% of Males have >50K income\")\n",
+    "print(f\"   Improvement: +{modified_male_pct - original_male_pct:.1f} percentage points\")\n",
+    "\n",
+    "print()\n",
+    "print(\"2. Female distribution (unchanged):\")\n",
+    "original_female_high_rate = (original_data_reload['sex'] == 'Female') & (original_data_reload['label'] == '>50K')\n",
+    "original_female_total = (original_data_reload['sex'] == 'Female')\n",
+    "original_female_pct = original_female_high_rate.sum() / original_female_total.sum() * 100\n",
+    "\n",
+    "modified_female_high_rate = (real_data['sex'] == 'Female') & (real_data['label'] == '>50K')\n",
+    "modified_female_total = (real_data['sex'] == 'Female')\n",
+    "modified_female_pct = modified_female_high_rate.sum() / modified_female_total.sum() * 100\n",
+    "\n",
+    "print(f\"   Original: {original_female_pct:.1f}% of Females have >50K income\")\n",
+    "print(f\"   Modified: {modified_female_pct:.1f}% of Females have >50K income\")\n",
+    "\n",
+    "print()\n",
+    "print(\"3. Overall minority label preservation:\")\n",
+    "original_high_pct = (original_data_reload['label'] == '>50K').sum() / len(original_data_reload) * 100\n",
+    "modified_high_pct = (real_data['label'] == '>50K').sum() / len(real_data) * 100\n",
+    "\n",
+    "print(f\"   Original: {original_high_pct:.1f}% overall have >50K income\")\n",
+    "print(f\"   Modified: {modified_high_pct:.1f}% overall have >50K income\")\n",
+    "\n",
+    "print()\n",
+    "print(\"4. Gender disparity created:\")\n",
+    "disparity_original = original_male_pct - original_female_pct\n",
+    "disparity_modified = modified_male_pct - modified_female_pct\n",
+    "\n",
+    "print(f\"   Original disparity (Male - Female): {disparity_original:.1f} percentage points\")\n",
+    "print(f\"   Modified disparity (Male - Female): {disparity_modified:.1f} percentage points\")\n",
+    "print(f\"   Disparity increase: +{disparity_modified - disparity_original:.1f} percentage points\")\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 14,