From 8cab10c54205d671cef0b4057cb61159774b3015 Mon Sep 17 00:00:00 2001
From: happpycorn <135821359+happpycorn@users.noreply.github.com>
Date: Thu, 9 Oct 2025 02:49:41 +0000
Subject: [PATCH 1/2] Complete data exploration, missing-value imputation and IQR outlier exercises
---
ML_data_preprocessing.ipynb | 891 +++++++++++++++++-------------------
1 file changed, 428 insertions(+), 463 deletions(-)
diff --git a/ML_data_preprocessing.ipynb b/ML_data_preprocessing.ipynb
index 03cbab5..7ccef6f 100644
--- a/ML_data_preprocessing.ipynb
+++ b/ML_data_preprocessing.ipynb
@@ -3,30 +3,38 @@
{
"cell_type": "code",
"execution_count": 1,
- "metadata": {},
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Requirement already satisfied: pandas in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (2.3.3)\n",
- "Requirement already satisfied: numpy in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (2.3.3)\n",
- "Requirement already satisfied: matplotlib in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (3.10.6)\n",
- "Requirement already satisfied: scikit-learn in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (1.7.2)\n",
- "Requirement already satisfied: python-dateutil>=2.8.2 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
- "Requirement already satisfied: pytz>=2020.1 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from pandas) (2025.2)\n",
- "Requirement already satisfied: tzdata>=2022.7 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from pandas) (2025.2)\n",
- "Requirement already satisfied: contourpy>=1.0.1 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (1.3.3)\n",
- "Requirement already satisfied: cycler>=0.10 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (0.12.1)\n",
- "Requirement already satisfied: fonttools>=4.22.0 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (4.60.1)\n",
- "Requirement already satisfied: kiwisolver>=1.3.1 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (1.4.9)\n",
- "Requirement already satisfied: packaging>=20.0 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (25.0)\n",
- "Requirement already satisfied: pillow>=8 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (11.3.0)\n",
- "Requirement already satisfied: pyparsing>=2.3.1 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from matplotlib) (3.2.5)\n",
- "Requirement already satisfied: scipy>=1.8.0 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from scikit-learn) (1.16.2)\n",
- "Requirement already satisfied: joblib>=1.2.0 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from scikit-learn) (1.5.2)\n",
- "Requirement already satisfied: threadpoolctl>=3.1.0 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from scikit-learn) (3.6.0)\n",
- "Requirement already satisfied: six>=1.5 in d:\\github\\1141-ml-data preprocessing\\.venv\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
+ "Requirement already satisfied: pandas in /home/codespace/.local/lib/python3.12/site-packages (2.3.1)\n",
+ "Requirement already satisfied: numpy in /home/codespace/.local/lib/python3.12/site-packages (2.3.1)\n",
+ "Requirement already satisfied: matplotlib in /home/codespace/.local/lib/python3.12/site-packages (3.10.3)\n",
+ "Requirement already satisfied: scikit-learn in /home/codespace/.local/lib/python3.12/site-packages (1.7.0)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2025.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2025.2)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (1.3.2)\n",
+ "Requirement already satisfied: cycler>=0.10 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (4.58.5)\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (1.4.8)\n",
+ "Requirement already satisfied: packaging>=20.0 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (25.0)\n",
+ "Requirement already satisfied: pillow>=8 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (11.3.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (3.2.3)\n",
+ "Requirement already satisfied: scipy>=1.8.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (1.16.0)\n",
+ "Requirement already satisfied: joblib>=1.2.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (1.5.1)\n",
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (3.6.0)\n",
+ "Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
@@ -54,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -223,7 +231,7 @@
"4 396.90 5.33 36.2 36.2 "
]
},
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -262,7 +270,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -281,14 +289,129 @@
"id": "-T2gXjrbwsLi",
"outputId": "279e371d-e3d4-4a13-eb87-f51892af6000"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1 | \n",
+ " 296.0 | \n",
+ " 15.3 | \n",
+ " 396.90 | \n",
+ " 4.98 | \n",
+ " 24.0 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 396.90 | \n",
+ " 9.14 | \n",
+ " 21.6 | \n",
+ " 21.6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 392.83 | \n",
+ " 4.03 | \n",
+ " 34.7 | \n",
+ " 34.7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 \n",
+ "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 \n",
+ "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 \n",
+ "\n",
+ " B LSTAT MEDV target \n",
+ "0 396.90 4.98 24.0 24.0 \n",
+ "1 396.90 9.14 21.6 21.6 \n",
+ "2 392.83 4.03 34.7 34.7 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 查看前三筆資料\n"
+ "# 查看前三筆資料\n",
+ "df.head(3)"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -307,14 +430,129 @@
"id": "3Uc2bIHWjGA-",
"outputId": "8c3d6b5d-1f81-4063-88c3-68adc956fc64"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 503 | \n",
+ " 0.06076 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.976 | \n",
+ " 91.0 | \n",
+ " 2.1675 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 396.90 | \n",
+ " 5.64 | \n",
+ " 23.9 | \n",
+ " 23.9 | \n",
+ "
\n",
+ " \n",
+ " | 504 | \n",
+ " 0.10959 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.794 | \n",
+ " 89.3 | \n",
+ " 2.3889 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 393.45 | \n",
+ " 6.48 | \n",
+ " 22.0 | \n",
+ " 22.0 | \n",
+ "
\n",
+ " \n",
+ " | 505 | \n",
+ " 0.04741 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.030 | \n",
+ " 80.8 | \n",
+ " 2.5050 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 396.90 | \n",
+ " 7.88 | \n",
+ " 11.9 | \n",
+ " 11.9 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 \n",
+ "504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 \n",
+ "505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 \n",
+ "\n",
+ " B LSTAT MEDV target \n",
+ "503 396.90 5.64 23.9 23.9 \n",
+ "504 393.45 6.48 22.0 22.0 \n",
+ "505 396.90 7.88 11.9 11.9 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 查看末三筆資料\n"
+ "# 查看末三筆資料\n",
+ "df.tail(3)"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -332,14 +570,26 @@
"id": "Z7tImjqTicrR",
"outputId": "a6665169-b8b9-4ca2-aecd-36bcc270a08a"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(506, 15)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 查看資料集大小 (列、欄)\n"
+ "# 查看資料集大小 (列、欄)\n",
+ "df.shape"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -357,14 +607,52 @@
"id": "Eyu5B1QhjIvG",
"outputId": "b55b94d1-96d0-4672-f6a9-4872a82e86a5"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 查看資料集的基本資訊\n"
+ "# 查看資料集的基本資訊\n",
+ "df.info()  # call the method; a bare df.info only displays the bound-method repr"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -386,413 +674,49 @@
"outputs": [
{
"data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"CRIM\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 176.21241273856964,\n \"min\": 0.00632,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 3.613523557312254,\n 0.25651,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ZN\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 174.65631992520625,\n \"min\": 0.0,\n \"max\": 506.0,\n \"num_unique_values\": 6,\n \"samples\": [\n 506.0,\n 11.363636363636363,\n 100.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"INDUS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 175.10046881853455,\n \"min\": 0.46,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 11.13677865612648,\n 9.69,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"NOX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 178.71946937975397,\n \"min\": 0.11587767566755611,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 0.5546950592885376,\n 0.538,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"RM\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 176.99257138815915,\n \"min\": 0.7026171434153237,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 6.284634387351779,\n 6.2085,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"AGE\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 161.29423343904304,\n \"min\": 2.9,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 68.57490118577076,\n 77.5,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"DIS\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 177.4338019618181,\n \"min\": 1.1296,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": 
[\n 3.795042687747036,\n 3.2074499999999997,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"TAX\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 205.93933614417855,\n \"min\": 168.53711605495926,\n \"max\": 711.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 408.2371541501976,\n 330.0,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"PTRATIO\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 173.36059244426343,\n \"min\": 2.164945523714446,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 18.455533596837945,\n 19.05,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"B\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 173.23587641654387,\n \"min\": 0.32,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 356.6740316205534,\n 391.44,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LSTAT\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 174.45535325169888,\n \"min\": 1.73,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 12.653063241106722,\n 11.36,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"MEDV\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 171.852511161592,\n \"min\": 5.0,\n \"max\": 506.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 22.532806324110677,\n 21.2,\n 506.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
- "type": "dataframe"
- },
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " CRIM | \n",
- " ZN | \n",
- " INDUS | \n",
- " NOX | \n",
- " RM | \n",
- " AGE | \n",
- " DIS | \n",
- " TAX | \n",
- " PTRATIO | \n",
- " B | \n",
- " LSTAT | \n",
- " MEDV | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | count | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- " 506.000000 | \n",
- "
\n",
- " \n",
- " | mean | \n",
- " 3.613524 | \n",
- " 11.363636 | \n",
- " 11.136779 | \n",
- " 0.554695 | \n",
- " 6.284634 | \n",
- " 68.574901 | \n",
- " 3.795043 | \n",
- " 408.237154 | \n",
- " 18.455534 | \n",
- " 356.674032 | \n",
- " 12.653063 | \n",
- " 22.532806 | \n",
- "
\n",
- " \n",
- " | std | \n",
- " 8.601545 | \n",
- " 23.322453 | \n",
- " 6.860353 | \n",
- " 0.115878 | \n",
- " 0.702617 | \n",
- " 28.148861 | \n",
- " 2.105710 | \n",
- " 168.537116 | \n",
- " 2.164946 | \n",
- " 91.294864 | \n",
- " 7.141062 | \n",
- " 9.197104 | \n",
- "
\n",
- " \n",
- " | min | \n",
- " 0.006320 | \n",
- " 0.000000 | \n",
- " 0.460000 | \n",
- " 0.385000 | \n",
- " 3.561000 | \n",
- " 2.900000 | \n",
- " 1.129600 | \n",
- " 187.000000 | \n",
- " 12.600000 | \n",
- " 0.320000 | \n",
- " 1.730000 | \n",
- " 5.000000 | \n",
- "
\n",
- " \n",
- " | 25% | \n",
- " 0.082045 | \n",
- " 0.000000 | \n",
- " 5.190000 | \n",
- " 0.449000 | \n",
- " 5.885500 | \n",
- " 45.025000 | \n",
- " 2.100175 | \n",
- " 279.000000 | \n",
- " 17.400000 | \n",
- " 375.377500 | \n",
- " 6.950000 | \n",
- " 17.025000 | \n",
- "
\n",
- " \n",
- " | 50% | \n",
- " 0.256510 | \n",
- " 0.000000 | \n",
- " 9.690000 | \n",
- " 0.538000 | \n",
- " 6.208500 | \n",
- " 77.500000 | \n",
- " 3.207450 | \n",
- " 330.000000 | \n",
- " 19.050000 | \n",
- " 391.440000 | \n",
- " 11.360000 | \n",
- " 21.200000 | \n",
- "
\n",
- " \n",
- " | 75% | \n",
- " 3.677083 | \n",
- " 12.500000 | \n",
- " 18.100000 | \n",
- " 0.624000 | \n",
- " 6.623500 | \n",
- " 94.075000 | \n",
- " 5.188425 | \n",
- " 666.000000 | \n",
- " 20.200000 | \n",
- " 396.225000 | \n",
- " 16.955000 | \n",
- " 25.000000 | \n",
- "
\n",
- " \n",
- " | max | \n",
- " 88.976200 | \n",
- " 100.000000 | \n",
- " 27.740000 | \n",
- " 0.871000 | \n",
- " 8.780000 | \n",
- " 100.000000 | \n",
- " 12.126500 | \n",
- " 711.000000 | \n",
- " 22.000000 | \n",
- " 396.900000 | \n",
- " 37.970000 | \n",
- " 50.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- "
\n"
- ],
"text/plain": [
- " CRIM ZN INDUS NOX RM AGE \\\n",
- "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n",
- "mean 3.613524 11.363636 11.136779 0.554695 6.284634 68.574901 \n",
- "std 8.601545 23.322453 6.860353 0.115878 0.702617 28.148861 \n",
- "min 0.006320 0.000000 0.460000 0.385000 3.561000 2.900000 \n",
- "25% 0.082045 0.000000 5.190000 0.449000 5.885500 45.025000 \n",
- "50% 0.256510 0.000000 9.690000 0.538000 6.208500 77.500000 \n",
- "75% 3.677083 12.500000 18.100000 0.624000 6.623500 94.075000 \n",
- "max 88.976200 100.000000 27.740000 0.871000 8.780000 100.000000 \n",
+ ""
]
},
- "execution_count": 51,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# 查看數值欄位的敘述性統計\n"
+ "# 查看數值欄位的敘述性統計\n",
+ "df.describe()  # call the method; a bare df.describe only displays the bound-method repr"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -811,9 +735,33 @@
"id": "Tz47vFk0jdDH",
"outputId": "90e8f49c-2417-4a09-ad3f-c8aafc4a4794"
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CRIM\n",
+ "0.01501 2\n",
+ "14.33370 2\n",
+ "0.03466 1\n",
+ "0.05083 1\n",
+ "0.03738 1\n",
+ " ..\n",
+ "1.27346 1\n",
+ "1.42502 1\n",
+ "1.34284 1\n",
+ "1.22358 1\n",
+ "0.13914 1\n",
+ "Name: count, Length: 504, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# 查看類別欄位 'MEDV' 中各類別的出現次數\n"
+ "# 查看類別欄位 'MEDV' 中各類別的出現次數\n",
+ "df['MEDV'].value_counts()"
]
},
{
@@ -847,7 +795,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -910,7 +858,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -953,7 +901,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1004,7 +952,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1167,7 +1115,7 @@
"4 396.90 5.33 36.2 "
]
},
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -1183,7 +1131,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1277,11 +1225,11 @@
"print(\"\\n平均值填補 CRIM 欄位:\\n\",df)\n",
"\n",
"# 中位數填補 RM 欄位\n",
- "# df['RM'] = ...\n",
+ "df['RM'] = df['RM'].fillna(df['RM'].median())\n",
"print(\"\\n中位數填補 RM 欄位:\\n\",df)\n",
"\n",
"# 眾數填補 AGE 欄位\n",
- "# df['AGE'] = ...\n",
+ "df['AGE'] = df['AGE'].fillna(df['AGE'].mode()[0])\n",
"print(\"\\n眾數填補 AGE 欄位:\\n\",df)\n",
"\n",
"print(\"\\n填補後的資料:\")\n",
@@ -1299,7 +1247,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 18,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1321,7 +1269,7 @@
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -1355,7 +1303,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1377,7 +1325,7 @@
"outputs": [
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
""
]
@@ -1407,7 +1355,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -1697,7 +1645,7 @@
"[66 rows x 14 columns]"
]
},
- "execution_count": 14,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -1735,7 +1683,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -1758,30 +1706,40 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "TAX 欄位的 IQR 上限: 1246.5, 下限: -301.5\n",
+ "RM 欄位的 IQR 上限: 7.730500000000001, 下限: 4.778499999999999\n",
"\n",
- "TAX 欄位的異常值數量: 0\n",
+ "RM 欄位的異常值數量: 30\n",
"\n",
"部分異常值資料點:\n",
- " Empty DataFrame\n",
- "Columns: [CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT, MEDV]\n",
- "Index: []\n"
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "97 0.12083 0.0 2.89 0 0.445 8.069 76.0 3.4952 2 276.0 18.0 \n",
+ "98 0.08187 0.0 2.89 0 0.445 7.820 36.9 3.4952 2 276.0 18.0 \n",
+ "162 1.83377 0.0 19.58 1 0.605 7.802 98.2 2.0407 5 403.0 14.7 \n",
+ "163 1.51902 0.0 19.58 1 0.605 8.375 93.9 2.1620 5 403.0 14.7 \n",
+ "166 2.01019 0.0 19.58 0 0.605 7.929 96.2 2.0459 5 403.0 14.7 \n",
+ "\n",
+ " B LSTAT MEDV \n",
+ "97 396.90 4.21 38.7 \n",
+ "98 393.53 3.57 43.8 \n",
+ "162 389.61 1.92 50.0 \n",
+ "163 388.45 3.32 50.0 \n",
+ "166 369.30 3.70 50.0 \n"
]
}
],
"source": [
"# 任務 1 & 2\n",
- "selected_column = 'TAX' # 或其他欄位\n",
+ "selected_column = 'RM' # 或其他欄位\n",
"col_desc = df[selected_column].describe()\n",
- "# Q1 = ...\n",
- "# Q3 = ...\n",
- "# IQR = ...\n",
- "# lower_bound = ...\n",
- "# upper_bound = ...\n",
+ "Q1 = col_desc['25%']\n",
+ "Q3 = col_desc['75%']\n",
+ "IQR = Q3 - Q1\n",
+ "lower_bound = Q1 - 1.5 * IQR\n",
+ "upper_bound = Q3 + 1.5 * IQR\n",
"print(f\"{selected_column} 欄位的 IQR 上限: {upper_bound}, 下限: {lower_bound}\\n\")\n",
"\n",
"# 任務 3\n",
- "# outliers = ...\n",
+ "outliers = df[(df[selected_column] < lower_bound) | (df[selected_column] > upper_bound)]\n",
"print(f\"{selected_column} 欄位的異常值數量: {len(outliers)}\\n\")\n",
"print(\"部分異常值資料點:\\n\", outliers.head())"
]
@@ -1795,7 +1753,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 22,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -2181,7 +2139,7 @@
"[288 rows x 29 columns]"
]
},
- "execution_count": 15,
+ "execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@@ -2231,7 +2189,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 23,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -2300,6 +2258,13 @@
"print(\"\\n移除重複後的資料:\")\n",
"print(df_cleaned)"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -2307,7 +2272,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": ".venv",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -2321,9 +2286,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.13.6"
+ "version": "3.12.1"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 4
}
From 7027854f8959e25d993b35ad8009cde9d8c84fd6 Mon Sep 17 00:00:00 2001
From: happpycorn <135821359+happpycorn@users.noreply.github.com>
Date: Thu, 9 Oct 2025 02:49:50 +0000
Subject: [PATCH 2/2] Add .ipynb_checkpoints autosave copy of ML_data_preprocessing notebook
---
.../ML_data_preprocessing-checkpoint.ipynb | 2294 +++++++++++++++++
1 file changed, 2294 insertions(+)
create mode 100644 .ipynb_checkpoints/ML_data_preprocessing-checkpoint.ipynb
diff --git a/.ipynb_checkpoints/ML_data_preprocessing-checkpoint.ipynb b/.ipynb_checkpoints/ML_data_preprocessing-checkpoint.ipynb
new file mode 100644
index 0000000..7ccef6f
--- /dev/null
+++ b/.ipynb_checkpoints/ML_data_preprocessing-checkpoint.ipynb
@@ -0,0 +1,2294 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true,
+ "jupyter": {
+ "outputs_hidden": true
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: pandas in /home/codespace/.local/lib/python3.12/site-packages (2.3.1)\n",
+ "Requirement already satisfied: numpy in /home/codespace/.local/lib/python3.12/site-packages (2.3.1)\n",
+ "Requirement already satisfied: matplotlib in /home/codespace/.local/lib/python3.12/site-packages (3.10.3)\n",
+ "Requirement already satisfied: scikit-learn in /home/codespace/.local/lib/python3.12/site-packages (1.7.0)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2025.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2025.2)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (1.3.2)\n",
+ "Requirement already satisfied: cycler>=0.10 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (4.58.5)\n",
+ "Requirement already satisfied: kiwisolver>=1.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (1.4.8)\n",
+ "Requirement already satisfied: packaging>=20.0 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (25.0)\n",
+ "Requirement already satisfied: pillow>=8 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (11.3.0)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /home/codespace/.local/lib/python3.12/site-packages (from matplotlib) (3.2.3)\n",
+ "Requirement already satisfied: scipy>=1.8.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (1.16.0)\n",
+ "Requirement already satisfied: joblib>=1.2.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (1.5.1)\n",
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /home/codespace/.local/lib/python3.12/site-packages (from scikit-learn) (3.6.0)\n",
+ "Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n",
+ "\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.2\u001b[0m\n",
+ "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install pandas numpy matplotlib scikit-learn"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bGPLRqx2-Of2"
+ },
+ "source": [
+ "# 載入與探索資料 (使用 Pandas 與 scikit-learn)\n",
+ "* 我們常用 Pandas DataFrame 來處理結構化的資料\n",
+ "\n",
+ "* 載入 CSV 檔案: 使用 pd.read_csv()\n",
+ "\n",
+ "* 載入 scikit-learn 內建資料集: scikit-learn 提供了一些範例資料集 (如波士頓房價資料集),這些資料集載入後通常是 dictionary 格式\n",
+ "\n",
+ "* 可以透過 `.keys()` 查看包含哪些內容 (如 data, target, feature_names, description)\n",
+ "* 需要將其轉換為 DataFrame 格式以便操作"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "executionInfo": {
+ "elapsed": 59,
+ "status": "ok",
+ "timestamp": 1759734207925,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "9TwEcXWpw-wY",
+ "outputId": "e98e4a98-e588-4101-bc17-ddaadc5fbbca"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1 | \n",
+ " 296.0 | \n",
+ " 15.3 | \n",
+ " 396.90 | \n",
+ " 4.98 | \n",
+ " 24.0 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 396.90 | \n",
+ " 9.14 | \n",
+ " 21.6 | \n",
+ " 21.6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 392.83 | \n",
+ " 4.03 | \n",
+ " 34.7 | \n",
+ " 34.7 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.03237 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 6.998 | \n",
+ " 45.8 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 394.63 | \n",
+ " 2.94 | \n",
+ " 33.4 | \n",
+ " 33.4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.06905 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 7.147 | \n",
+ " 54.2 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 396.90 | \n",
+ " 5.33 | \n",
+ " 36.2 | \n",
+ " 36.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 \n",
+ "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 \n",
+ "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 \n",
+ "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 \n",
+ "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 \n",
+ "\n",
+ " B LSTAT MEDV target \n",
+ "0 396.90 4.98 24.0 24.0 \n",
+ "1 396.90 9.14 21.6 21.6 \n",
+ "2 392.83 4.03 34.7 34.7 \n",
+ "3 394.63 2.94 33.4 33.4 \n",
+ "4 396.90 5.33 36.2 36.2 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 載入必要的套件、從 scikit-learn 套件中匯入 fetch_openml函式來從 OpenML 平台(一個公開的資料集儲存庫)下載各種資料集。\n",
+ "# name='boston':指定要下載的資料集名稱。version=1:指定版本號。\n",
+ "# as_frame=True:讓輸出的資料以 pandas DataFrame 格式呈現;as_frame=False 則是傳回 numpy 陣列(目前版本的預設值為 'auto')\n",
+ "# fetch_openml() 回傳一個 Bunch 物件(類似字典)\n",
+ "#boston.frame:當 as_frame=True 時,包含所有欄位的 DataFrame。\n",
+ "import pandas as pd\n",
+ "from sklearn.datasets import fetch_openml\n",
+ "boston = fetch_openml(name='boston', version=1, as_frame=True)\n",
+ "df = boston.frame\n",
+ "df[\"target\"]=boston.target\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Yte7XqeCiFy_"
+ },
+ "source": [
+ "# 課堂練習: 探索 DataFrame(上一周上課內容複習)\n",
+ "* 查看前/後幾筆資料:df.head() / df.tail() (預設顯示前/後 5 筆,可指定數字)\n",
+ "\n",
+ "* 查看資料的維度 (形狀):df.shape (回傳 (列數, 欄位數))\n",
+ "\n",
+ "* 查看資料的基本資訊 (欄位數、資料筆數、是否有缺失值、資料型態):df.info()\n",
+ "\n",
+ "* 查看數值欄位的敘述性統計 (平均值、標準差、最小值、最大值、四分位數等):df.describe()\n",
+ "\n",
+ "* 查看類別欄位中各類別的出現次數:df['欄位名稱'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 143
+ },
+ "executionInfo": {
+ "elapsed": 59,
+ "status": "ok",
+ "timestamp": 1759734496497,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "-T2gXjrbwsLi",
+ "outputId": "279e371d-e3d4-4a13-eb87-f51892af6000"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1 | \n",
+ " 296.0 | \n",
+ " 15.3 | \n",
+ " 396.90 | \n",
+ " 4.98 | \n",
+ " 24.0 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 396.90 | \n",
+ " 9.14 | \n",
+ " 21.6 | \n",
+ " 21.6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 392.83 | \n",
+ " 4.03 | \n",
+ " 34.7 | \n",
+ " 34.7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 \n",
+ "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 \n",
+ "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 \n",
+ "\n",
+ " B LSTAT MEDV target \n",
+ "0 396.90 4.98 24.0 24.0 \n",
+ "1 396.90 9.14 21.6 21.6 \n",
+ "2 392.83 4.03 34.7 34.7 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 查看前三筆資料\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 143
+ },
+ "executionInfo": {
+ "elapsed": 43,
+ "status": "ok",
+ "timestamp": 1759734499990,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "3Uc2bIHWjGA-",
+ "outputId": "8c3d6b5d-1f81-4063-88c3-68adc956fc64"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ " target | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 503 | \n",
+ " 0.06076 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.976 | \n",
+ " 91.0 | \n",
+ " 2.1675 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 396.90 | \n",
+ " 5.64 | \n",
+ " 23.9 | \n",
+ " 23.9 | \n",
+ "
\n",
+ " \n",
+ " | 504 | \n",
+ " 0.10959 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.794 | \n",
+ " 89.3 | \n",
+ " 2.3889 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 393.45 | \n",
+ " 6.48 | \n",
+ " 22.0 | \n",
+ " 22.0 | \n",
+ "
\n",
+ " \n",
+ " | 505 | \n",
+ " 0.04741 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.030 | \n",
+ " 80.8 | \n",
+ " 2.5050 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " 21.0 | \n",
+ " 396.90 | \n",
+ " 7.88 | \n",
+ " 11.9 | \n",
+ " 11.9 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 21.0 \n",
+ "504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 21.0 \n",
+ "505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 21.0 \n",
+ "\n",
+ " B LSTAT MEDV target \n",
+ "503 396.90 5.64 23.9 23.9 \n",
+ "504 393.45 6.48 22.0 22.0 \n",
+ "505 396.90 7.88 11.9 11.9 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 查看末三筆資料\n",
+ "df.tail(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 11,
+ "status": "ok",
+ "timestamp": 1759734504022,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "Z7tImjqTicrR",
+ "outputId": "a6665169-b8b9-4ca2-aecd-36bcc270a08a"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(506, 15)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 查看資料集大小 (列、欄)\n",
+ "df.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 21,
+ "status": "ok",
+ "timestamp": 1750840600233,
+ "user": {
+ "displayName": "chen nicole",
+ "userId": "10741717251477288554"
+ },
+ "user_tz": -480
+ },
+ "id": "Eyu5B1QhjIvG",
+ "outputId": "b55b94d1-96d0-4672-f6a9-4872a82e86a5"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show basic dataset info (columns, non-null counts, dtypes); info() must be called -- a bare `df.info` only displays the bound method.\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 300
+ },
+ "executionInfo": {
+ "elapsed": 87,
+ "status": "ok",
+ "timestamp": 1750840601682,
+ "user": {
+ "displayName": "chen nicole",
+ "userId": "10741717251477288554"
+ },
+ "user_tz": -480
+ },
+ "id": "FoPykZm_jTGe",
+ "outputId": "822d2392-4326-4e17-96fa-b05423d1df67"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Show descriptive statistics of the numeric columns; describe() must be called -- a bare `df.describe` only displays the bound method.\n",
+ "df.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 489
+ },
+ "executionInfo": {
+ "elapsed": 51,
+ "status": "ok",
+ "timestamp": 1759734719612,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "Tz47vFk0jdDH",
+ "outputId": "90e8f49c-2417-4a09-ad3f-c8aafc4a4794"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CRIM\n",
+ "0.01501 2\n",
+ "14.33370 2\n",
+ "0.03466 1\n",
+ "0.05083 1\n",
+ "0.03738 1\n",
+ " ..\n",
+ "1.27346 1\n",
+ "1.42502 1\n",
+ "1.34284 1\n",
+ "1.22358 1\n",
+ "0.13914 1\n",
+ "Name: count, Length: 504, dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 查看 'CRIM' 欄位中各數值的出現次數\n",
+ "df['CRIM'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qlxCTqXzsAhi"
+ },
+ "source": [
+ "# 資料清理 - 處理遺失值 (Missing Values)\n",
+ "\n",
+ "* 使用 `df.info()` 可以快速查看每個欄位非空值的數量,從而推算遺失值.\n",
+ "* 使用 `df.isnull()` 或 `df.isna()`:回傳一個與 DataFrame 形狀相同的 boolean DataFrame,True 表示該位置是遺失值 (NaN),False 表示非遺失值.\n",
+ "* 搭配 .sum():`df.isnull().sum()` 可以快速計算每個欄位的遺失值數量.\n",
+ "* 搭配 .any():`df.isnull().any()` 可以快速判斷哪些欄位包含遺失值.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VeCLIqzSskcs"
+ },
+ "source": [
+ "# 課堂練習:刪除缺失值\n",
+ "\n",
+ "* 任務: 創建一個新的 DataFrame,其中包含幾列和幾行,並手動將一些值設為 np.nan。\n",
+ "\n",
+ "然後嘗試使用 `df.dropna()` 刪除包含遺失值的列。\n",
+ "\n",
+ "觀察刪除前後 DataFrame 的形狀 (.shape) 變化。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 50,
+ "status": "ok",
+ "timestamp": 1759735067024,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "HF5fimpQsk3b",
+ "outputId": "7c701a7f-f4db-4bd5-f634-66fdfe3f9b83"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "原始 DataFrame:\n",
+ " col1 col2 col3\n",
+ "0 1.0 5.0 9\n",
+ "1 2.0 NaN 10\n",
+ "2 NaN NaN 11\n",
+ "3 4.0 8.0 12\n",
+ "原始形狀: (4, 3)\n",
+ "\n",
+ "刪除遺失值後的 DataFrame:\n",
+ " col1 col2 col3\n",
+ "0 1.0 5.0 9\n",
+ "3 4.0 8.0 12\n",
+ "刪除遺失值後的形狀: (2, 3)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# 創建範例 DataFrame\n",
+ "data = {\n",
+ " 'col1': [1, 2, np.nan, 4],\n",
+ " 'col2': [5, np.nan, np.nan, 8],\n",
+ " 'col3': [9, 10, 11, 12] # 補上 col3 的資料\n",
+ "}\n",
+ "test_df = pd.DataFrame(data)\n",
+ "\n",
+ "# 觀察原始 DataFrame\n",
+ "print(\"原始 DataFrame:\\n\", test_df)\n",
+ "print(\"原始形狀:\", test_df.shape)\n",
+ "\n",
+ "# 處理缺失值後再次觀察\n",
+ "test_df_dropped = test_df.dropna() # 刪除包含 NaN 的列\n",
+ "print(\"\\n刪除遺失值後的 DataFrame:\\n\", test_df_dropped)\n",
+ "print(\"刪除遺失值後的形狀:\", test_df_dropped.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 41,
+ "status": "ok",
+ "timestamp": 1759735290075,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "hs0fa40MZL7l",
+ "outputId": "d3cc8907-8db2-4a6e-9205-6820a66ddaa0"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "刪除遺失值後的 DataFrame:\n",
+ " col3\n",
+ "0 9\n",
+ "1 10\n",
+ "2 11\n",
+ "3 12\n",
+ "刪除遺失值後的形狀: (4, 1)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 處理缺失值後再次觀察\n",
+ "test_df_dropped_column = test_df.dropna(axis=1) # 刪除包含 NaN 的欄\n",
+ "print(\"\\n刪除遺失值後的 DataFrame:\\n\", test_df_dropped_column)\n",
+ "print(\"刪除遺失值後的形狀:\", test_df_dropped_column.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 13,
+ "status": "ok",
+ "timestamp": 1759735388528,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "mj_z0Q-8Zd2v",
+ "outputId": "3e1aacb8-96af-46aa-fa6d-9b8515723844"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "刪除遺失值後的 DataFrame:\n",
+ " col1 col2 col3\n",
+ "0 1.0 5.0 9\n",
+ "1 2.0 NaN 10\n",
+ "3 4.0 8.0 12\n",
+ "刪除遺失值後的形狀: (3, 3)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 處理缺失值後再次觀察\n",
+ "test_df_dropped_thresh2 = test_df.dropna(thresh=2) # 只保留至少有 2 個非遺失值的列(thresh 為每列所需的最少非遺失值數量)\n",
+ "print(\"\\n刪除遺失值後的 DataFrame:\\n\", test_df_dropped_thresh2)\n",
+ "print(\"刪除遺失值後的形狀:\", test_df_dropped_thresh2.shape)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "49GtgmqdzJcf"
+ },
+ "source": [
+ "# 課堂練習: 填補缺失值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "executionInfo": {
+ "elapsed": 70,
+ "status": "ok",
+ "timestamp": 1750840650396,
+ "user": {
+ "displayName": "chen nicole",
+ "userId": "10741717251477288554"
+ },
+ "user_tz": -480
+ },
+ "id": "X8j4lT_YPBOp",
+ "outputId": "f8ffc8ac-f26c-4c14-bfa0-6c469c9ff9c0"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1 | \n",
+ " 296.0 | \n",
+ " 15.3 | \n",
+ " 396.90 | \n",
+ " 4.98 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 396.90 | \n",
+ " 9.14 | \n",
+ " 21.6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " 17.8 | \n",
+ " 392.83 | \n",
+ " 4.03 | \n",
+ " 34.7 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.03237 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 6.998 | \n",
+ " 45.8 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 394.63 | \n",
+ " 2.94 | \n",
+ " 33.4 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.06905 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 7.147 | \n",
+ " 54.2 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " 18.7 | \n",
+ " 396.90 | \n",
+ " 5.33 | \n",
+ " 36.2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 15.3 \n",
+ "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 17.8 \n",
+ "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 17.8 \n",
+ "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 18.7 \n",
+ "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 18.7 \n",
+ "\n",
+ " B LSTAT MEDV \n",
+ "0 396.90 4.98 24.0 \n",
+ "1 396.90 9.14 21.6 \n",
+ "2 392.83 4.03 34.7 \n",
+ "3 394.63 2.94 33.4 \n",
+ "4 396.90 5.33 36.2 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ " # 載入必要的套件\n",
+ "import pandas as pd\n",
+ "from sklearn.datasets import fetch_openml\n",
+ "boston = fetch_openml(name='boston', version=1, as_frame=True)\n",
+ "df = boston.frame\n",
+ "df.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 62,
+ "status": "ok",
+ "timestamp": 1759737346289,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "pvQAcwzvzJxh",
+ "outputId": "cd903fe8-3024-4206-aa27-d74f4463f0cf"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "原始資料:\n",
+ " CRIM RM AGE\n",
+ "0 0.1 6.5 65.0\n",
+ "1 NaN 7.1 72.0\n",
+ "2 0.3 NaN NaN\n",
+ "3 0.2 5.9 65.0\n",
+ "查看資料分布:\n",
+ " CRIM RM AGE\n",
+ "count 3.00 3.0 3.000000\n",
+ "mean 0.20 6.5 67.333333\n",
+ "std 0.10 0.6 4.041452\n",
+ "min 0.10 5.9 65.000000\n",
+ "25% 0.15 6.2 65.000000\n",
+ "50% 0.20 6.5 65.000000\n",
+ "75% 0.25 6.8 68.500000\n",
+ "max 0.30 7.1 72.000000\n",
+ "\n",
+ "平均值填補 CRIM 欄位:\n",
+ " CRIM RM AGE\n",
+ "0 0.1 6.5 65.0\n",
+ "1 0.2 7.1 72.0\n",
+ "2 0.3 NaN NaN\n",
+ "3 0.2 5.9 65.0\n",
+ "\n",
+ "中位數填補 RM 欄位:\n",
+ " CRIM RM AGE\n",
+ "0 0.1 6.5 65.0\n",
+ "1 0.2 7.1 72.0\n",
+ "2 0.3 6.5 NaN\n",
+ "3 0.2 5.9 65.0\n",
+ "\n",
+ "眾數填補 AGE 欄位:\n",
+ " CRIM RM AGE\n",
+ "0 0.1 6.5 65.0\n",
+ "1 0.2 7.1 72.0\n",
+ "2 0.3 6.5 65.0\n",
+ "3 0.2 5.9 65.0\n",
+ "\n",
+ "填補後的資料:\n",
+ " CRIM RM AGE\n",
+ "0 0.1 6.5 65.0\n",
+ "1 0.2 7.1 72.0\n",
+ "2 0.3 6.5 65.0\n",
+ "3 0.2 5.9 65.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "#處理遺失值\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "# 建立模擬資料\n",
+ "data = {\n",
+ " 'CRIM': [0.1, np.nan, 0.3, 0.2],\n",
+ " 'RM': [6.5, 7.1, np.nan, 5.9],\n",
+ " 'AGE': [65, 72, np.nan, 65]\n",
+ "}\n",
+ "\n",
+ "df = pd.DataFrame(data)\n",
+ "\n",
+ "print(\"原始資料:\")\n",
+ "print(df)\n",
+ "#查看資料\n",
+ "print(\"查看資料分布:\\n\",df.describe())\n",
+ "\n",
+ "# 平均值填補 CRIM 欄位\n",
+ "df['CRIM'] = df['CRIM'].fillna(df['CRIM'].mean())\n",
+ "print(\"\\n平均值填補 CRIM 欄位:\\n\",df)\n",
+ "\n",
+ "# 中位數填補 RM 欄位\n",
+ "df['RM'] = df['RM'].fillna(df['RM'].median())\n",
+ "print(\"\\n中位數填補 RM 欄位:\\n\",df)\n",
+ "\n",
+ "# 眾數填補 AGE 欄位\n",
+ "df['AGE'] = df['AGE'].fillna(df['AGE'].mode()[0])\n",
+ "print(\"\\n眾數填補 AGE 欄位:\\n\",df)\n",
+ "\n",
+ "print(\"\\n填補後的資料:\")\n",
+ "print(df)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "P2PPz4z25YDS"
+ },
+ "source": [
+ "# 課堂範例: 處理異常值(接續波士頓房價資料集)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "executionInfo": {
+ "elapsed": 291,
+ "status": "ok",
+ "timestamp": 1759738143458,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "gTxpd_CFPFHK",
+ "outputId": "5f09f83f-95ae-4d6a-f21c-68a35d52e55a"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ " # 載入必要的套件\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.datasets import fetch_openml\n",
+ "boston = fetch_openml(name='boston', version=1, as_frame=True)\n",
+ "df = boston.frame\n",
+ "df.head(5)\n",
+ "\n",
+ "\n",
+ "# 只選擇數值型欄位\n",
+ "numeric_cols = df.select_dtypes(include='number').columns.tolist()\n",
+ "\n",
+ "# 畫出這些欄位的 boxplot\n",
+ "plt.figure(figsize=(max(12, len(numeric_cols) * 0.8), 6))\n",
+ "plt.boxplot(df[numeric_cols].values, tick_labels=numeric_cols, showfliers=True)\n",
+ "\n",
+ "plt.xticks(rotation=45, ha=\"right\")\n",
+ "plt.title(\"Boston Housing Dataset – Box Plots of Numerical Features\")\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 753
+ },
+ "executionInfo": {
+ "elapsed": 178,
+ "status": "ok",
+ "timestamp": 1759739619206,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "cRnRvCFziONd",
+ "outputId": "aff4d629-f559-4586-d1c8-37ae1482c4f6"
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# 畫出這些欄位的 boxplot\n",
+ "# CRIM: 有非常多的點遠高於上界,表示有些地區的犯罪率極高。每人平均的城鎮犯罪率(per capita crime rate by town)\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.datasets import fetch_openml\n",
+ "boston = fetch_openml(name='boston', version=1, as_frame=True)\n",
+ "df = boston.frame\n",
+ "#df.head(5)\n",
+ "#print(df[\"CRIM\"].values,\"type:\",type(df[\"CRIM\"].values))\n",
+ "# Fixed figure size for this single-column plot; avoids depending on numeric_cols, which is only defined in an earlier cell.\n",
+ "plt.figure(figsize=(12, 6))\n",
+ "plt.boxplot(df[\"CRIM\"].values, tick_labels=[\"CRIM\"])\n",
+ "\n",
+ "plt.xticks(rotation=45, ha=\"right\")\n",
+ "plt.title(\"Boston Housing Dataset – Box Plots [CRIM]\")\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 496
+ },
+ "executionInfo": {
+ "elapsed": 15,
+ "status": "ok",
+ "timestamp": 1759739438598,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "mgjjj1EG5bBS",
+ "outputId": "e5487e9a-c071-48b7-f06a-2175420e4230"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "lower_bound: -5.31051125 upper_bound: 9.06963875\n",
+ "CRIM 欄位的異常值數量: 66\n",
+ "outliers data:\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " PTRATIO | \n",
+ " B | \n",
+ " LSTAT | \n",
+ " MEDV | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 367 | \n",
+ " 13.5222 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.631 | \n",
+ " 3.863 | \n",
+ " 100.0 | \n",
+ " 1.5106 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 131.42 | \n",
+ " 13.33 | \n",
+ " 23.1 | \n",
+ "
\n",
+ " \n",
+ " | 371 | \n",
+ " 9.2323 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.631 | \n",
+ " 6.216 | \n",
+ " 100.0 | \n",
+ " 1.1691 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 366.15 | \n",
+ " 9.53 | \n",
+ " 50.0 | \n",
+ "
\n",
+ " \n",
+ " | 373 | \n",
+ " 11.1081 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.668 | \n",
+ " 4.906 | \n",
+ " 100.0 | \n",
+ " 1.1742 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 396.90 | \n",
+ " 34.77 | \n",
+ " 13.8 | \n",
+ "
\n",
+ " \n",
+ " | 374 | \n",
+ " 18.4982 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.668 | \n",
+ " 4.138 | \n",
+ " 100.0 | \n",
+ " 1.1370 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 396.90 | \n",
+ " 37.97 | \n",
+ " 13.8 | \n",
+ "
\n",
+ " \n",
+ " | 375 | \n",
+ " 19.6091 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.671 | \n",
+ " 7.313 | \n",
+ " 97.9 | \n",
+ " 1.3163 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 396.90 | \n",
+ " 13.44 | \n",
+ " 15.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 468 | \n",
+ " 15.5757 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.580 | \n",
+ " 5.926 | \n",
+ " 71.0 | \n",
+ " 2.9084 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 368.74 | \n",
+ " 18.13 | \n",
+ " 19.1 | \n",
+ "
\n",
+ " \n",
+ " | 469 | \n",
+ " 13.0751 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.580 | \n",
+ " 5.713 | \n",
+ " 56.7 | \n",
+ " 2.8237 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 396.90 | \n",
+ " 14.76 | \n",
+ " 20.1 | \n",
+ "
\n",
+ " \n",
+ " | 477 | \n",
+ " 15.0234 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.614 | \n",
+ " 5.304 | \n",
+ " 97.3 | \n",
+ " 2.1007 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 349.48 | \n",
+ " 24.91 | \n",
+ " 12.0 | \n",
+ "
\n",
+ " \n",
+ " | 478 | \n",
+ " 10.2330 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.614 | \n",
+ " 6.185 | \n",
+ " 96.7 | \n",
+ " 2.1705 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 379.70 | \n",
+ " 18.03 | \n",
+ " 14.6 | \n",
+ "
\n",
+ " \n",
+ " | 479 | \n",
+ " 14.3337 | \n",
+ " 0.0 | \n",
+ " 18.1 | \n",
+ " 0 | \n",
+ " 0.614 | \n",
+ " 6.229 | \n",
+ " 88.0 | \n",
+ " 1.9512 | \n",
+ " 24 | \n",
+ " 666.0 | \n",
+ " 20.2 | \n",
+ " 383.32 | \n",
+ " 13.11 | \n",
+ " 21.4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
66 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
+ "367 13.5222 0.0 18.1 0 0.631 3.863 100.0 1.5106 24 666.0 \n",
+ "371 9.2323 0.0 18.1 0 0.631 6.216 100.0 1.1691 24 666.0 \n",
+ "373 11.1081 0.0 18.1 0 0.668 4.906 100.0 1.1742 24 666.0 \n",
+ "374 18.4982 0.0 18.1 0 0.668 4.138 100.0 1.1370 24 666.0 \n",
+ "375 19.6091 0.0 18.1 0 0.671 7.313 97.9 1.3163 24 666.0 \n",
+ ".. ... ... ... ... ... ... ... ... .. ... \n",
+ "468 15.5757 0.0 18.1 0 0.580 5.926 71.0 2.9084 24 666.0 \n",
+ "469 13.0751 0.0 18.1 0 0.580 5.713 56.7 2.8237 24 666.0 \n",
+ "477 15.0234 0.0 18.1 0 0.614 5.304 97.3 2.1007 24 666.0 \n",
+ "478 10.2330 0.0 18.1 0 0.614 6.185 96.7 2.1705 24 666.0 \n",
+ "479 14.3337 0.0 18.1 0 0.614 6.229 88.0 1.9512 24 666.0 \n",
+ "\n",
+ " PTRATIO B LSTAT MEDV \n",
+ "367 20.2 131.42 13.33 23.1 \n",
+ "371 20.2 366.15 9.53 50.0 \n",
+ "373 20.2 396.90 34.77 13.8 \n",
+ "374 20.2 396.90 37.97 13.8 \n",
+ "375 20.2 396.90 13.44 15.0 \n",
+ ".. ... ... ... ... \n",
+ "468 20.2 368.74 18.13 19.1 \n",
+ "469 20.2 396.90 14.76 20.1 \n",
+ "477 20.2 349.48 24.91 12.0 \n",
+ "478 20.2 379.70 18.03 14.6 \n",
+ "479 20.2 383.32 13.11 21.4 \n",
+ "\n",
+ "[66 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 使用 IQR 方法偵測異常值 (以波士頓房價資料集的 'CRIM' 欄位為例)\n",
+ "# 首先使用 describe() 獲取四分位數\n",
+ "desc = df['CRIM'].describe()\n",
+ "Q1 = desc['25%']\n",
+ "Q3 = desc['75%']\n",
+ "IQR = Q3 - Q1\n",
+ "lower_bound = Q1 - 1.5 * IQR\n",
+ "upper_bound = Q3 + 1.5 * IQR\n",
+ "print(\"lower_bound:\",lower_bound,\"upper_bound:\",upper_bound)\n",
+ "# 找出異常值\n",
+ "outliers = df[(df['CRIM'] < lower_bound) | (df['CRIM'] > upper_bound)]\n",
+ "\n",
+ "print(\"CRIM 欄位的異常值數量:\", len(outliers))\n",
+ "print(\"outliers data:\\n\")\n",
+ "outliers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ho7jb66g4vCI"
+ },
+ "source": [
+ "# 課堂練習:處理異常值(接續波士頓房價資料集)\n",
+ "\n",
+ "1. 任務: 繼續使用波士頓房價資料集 (df),偵測其他欄位(例如:'TAX' 或 'B')的異常值。\n",
+ "2. 任務: 使用四分位距 (IQR) 方法,計算該欄位的異常值上下限。\n",
+ "3. 任務: 找出該欄位中被判定為異常值的資料點,並列出其數量。\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 44,
+ "status": "ok",
+ "timestamp": 1759739862777,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "8rOk-tX64vVc",
+ "outputId": "7f9e2837-2587-4ae5-823b-1d857228b6eb"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RM 欄位的 IQR 上限: 7.730500000000001, 下限: 4.778499999999999\n",
+ "\n",
+ "RM 欄位的異常值數量: 30\n",
+ "\n",
+ "部分異常值資料點:\n",
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO \\\n",
+ "97 0.12083 0.0 2.89 0 0.445 8.069 76.0 3.4952 2 276.0 18.0 \n",
+ "98 0.08187 0.0 2.89 0 0.445 7.820 36.9 3.4952 2 276.0 18.0 \n",
+ "162 1.83377 0.0 19.58 1 0.605 7.802 98.2 2.0407 5 403.0 14.7 \n",
+ "163 1.51902 0.0 19.58 1 0.605 8.375 93.9 2.1620 5 403.0 14.7 \n",
+ "166 2.01019 0.0 19.58 0 0.605 7.929 96.2 2.0459 5 403.0 14.7 \n",
+ "\n",
+ " B LSTAT MEDV \n",
+ "97 396.90 4.21 38.7 \n",
+ "98 393.53 3.57 43.8 \n",
+ "162 389.61 1.92 50.0 \n",
+ "163 388.45 3.32 50.0 \n",
+ "166 369.30 3.70 50.0 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# 任務 1 & 2\n",
+ "selected_column = 'RM' # 或其他欄位\n",
+ "col_desc = df[selected_column].describe()\n",
+ "Q1 = col_desc['25%']\n",
+ "Q3 = col_desc['75%']\n",
+ "IQR = Q3 - Q1\n",
+ "lower_bound = Q1 - 1.5 * IQR\n",
+ "upper_bound = Q3 + 1.5 * IQR\n",
+ "print(f\"{selected_column} 欄位的 IQR 上限: {upper_bound}, 下限: {lower_bound}\\n\")\n",
+ "\n",
+ "# 任務 3\n",
+ "outliers = df[(df[selected_column] < lower_bound) | (df[selected_column] > upper_bound)]\n",
+ "print(f\"{selected_column} 欄位的異常值數量: {len(outliers)}\\n\")\n",
+ "print(\"部分異常值資料點:\\n\", outliers.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# 課堂範例: 針對所有數值欄位計算異常值"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 478
+ },
+ "executionInfo": {
+ "elapsed": 129,
+ "status": "ok",
+ "timestamp": 1759742659739,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "2ClLvXDvq_1l",
+ "outputId": "88719d4e-32b8-45db-b7f1-95baf449a97c"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',\n",
+ " 'PTRATIO', 'B', 'LSTAT', 'MEDV', 'target'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CRIM | \n",
+ " ZN | \n",
+ " INDUS | \n",
+ " CHAS | \n",
+ " NOX | \n",
+ " RM | \n",
+ " AGE | \n",
+ " DIS | \n",
+ " RAD | \n",
+ " TAX | \n",
+ " ... | \n",
+ " RMoutlier | \n",
+ " AGEoutlier | \n",
+ " DISoutlier | \n",
+ " TAXoutlier | \n",
+ " PTRATIOoutlier | \n",
+ " Boutlier | \n",
+ " LSTAToutlier | \n",
+ " MEDVoutlier | \n",
+ " targetoutlier | \n",
+ " out_sum | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0.00632 | \n",
+ " 18.0 | \n",
+ " 2.31 | \n",
+ " 0 | \n",
+ " 0.538 | \n",
+ " 6.575 | \n",
+ " 65.2 | \n",
+ " 4.0900 | \n",
+ " 1 | \n",
+ " 296.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0.02731 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 6.421 | \n",
+ " 78.9 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.02729 | \n",
+ " 0.0 | \n",
+ " 7.07 | \n",
+ " 0 | \n",
+ " 0.469 | \n",
+ " 7.185 | \n",
+ " 61.1 | \n",
+ " 4.9671 | \n",
+ " 2 | \n",
+ " 242.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.03237 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 6.998 | \n",
+ " 45.8 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.06905 | \n",
+ " 0.0 | \n",
+ " 2.18 | \n",
+ " 0 | \n",
+ " 0.458 | \n",
+ " 7.147 | \n",
+ " 54.2 | \n",
+ " 6.0622 | \n",
+ " 3 | \n",
+ " 222.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 501 | \n",
+ " 0.06263 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.593 | \n",
+ " 69.1 | \n",
+ " 2.4786 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 502 | \n",
+ " 0.04527 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.120 | \n",
+ " 76.7 | \n",
+ " 2.2875 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 503 | \n",
+ " 0.06076 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.976 | \n",
+ " 91.0 | \n",
+ " 2.1675 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 504 | \n",
+ " 0.10959 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.794 | \n",
+ " 89.3 | \n",
+ " 2.3889 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 505 | \n",
+ " 0.04741 | \n",
+ " 0.0 | \n",
+ " 11.93 | \n",
+ " 0 | \n",
+ " 0.573 | \n",
+ " 6.030 | \n",
+ " 80.8 | \n",
+ " 2.5050 | \n",
+ " 1 | \n",
+ " 273.0 | \n",
+ " ... | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
288 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX ... \\\n",
+ "0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296.0 ... \n",
+ "1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242.0 ... \n",
+ "2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242.0 ... \n",
+ "3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222.0 ... \n",
+ "4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222.0 ... \n",
+ ".. ... ... ... ... ... ... ... ... .. ... ... \n",
+ "501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273.0 ... \n",
+ "502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273.0 ... \n",
+ "503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273.0 ... \n",
+ "504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273.0 ... \n",
+ "505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273.0 ... \n",
+ "\n",
+ " RMoutlier AGEoutlier DISoutlier TAXoutlier PTRATIOoutlier Boutlier \\\n",
+ "0 False False False False False False \n",
+ "1 False False False False False False \n",
+ "2 False False False False False False \n",
+ "3 False False False False False False \n",
+ "4 False False False False False False \n",
+ ".. ... ... ... ... ... ... \n",
+ "501 False False False False False False \n",
+ "502 False False False False False False \n",
+ "503 False False False False False False \n",
+ "504 False False False False False False \n",
+ "505 False False False False False False \n",
+ "\n",
+ " LSTAToutlier MEDVoutlier targetoutlier out_sum \n",
+ "0 False False False 0 \n",
+ "1 False False False 0 \n",
+ "2 False False False 0 \n",
+ "3 False False False 0 \n",
+ "4 False False False 0 \n",
+ ".. ... ... ... ... \n",
+ "501 False False False 0 \n",
+ "502 False False False 0 \n",
+ "503 False False False 0 \n",
+ "504 False False False 0 \n",
+ "505 False False False 0 \n",
+ "\n",
+ "[288 rows x 29 columns]"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 載入必要的套件 \n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.datasets import fetch_openml\n",
+ "boston = fetch_openml(name='boston', version=1, as_frame=True)\n",
+ "df = boston.frame\n",
+ "df[\"target\"]=boston.target\n",
+ "print(df.columns)\n",
+ "\n",
+ "for col in df.select_dtypes(include=['float64', 'int64']).columns:\n",
+ " col_name = col + \"outlier\"\n",
+ " q1 = df[col].quantile(0.25)\n",
+ " q3 = df[col].quantile(0.75)\n",
+ " IQR = q3 - q1\n",
+ " lower_bound = q1 - 1.5 * IQR\n",
+ " upper_bound = q3 + 1.5 * IQR\n",
+ "\n",
+ " # 先全部設為 False(非離群值)\n",
+ " df[col_name] = False\n",
+ "\n",
+ " # 再將離群值的位置更新為 True\n",
+ " df.loc[(df[col] < lower_bound) | (df[col] > upper_bound), col_name] = True\n",
+ "df[\"out_sum\"]=0\n",
+ "for col in df.select_dtypes(include=['boolean']).columns:\n",
+ " if \"outlier\" in col:\n",
+ " #print(df[col],\"type\",type(df[col]))\n",
+ " df[\"out_sum\"] = df[\"out_sum\"] + df[col].astype(int)\n",
+ "\n",
+ "\n",
+ "df[df[\"out_sum\"]==0]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jwPNMU6W7nXg"
+ },
+ "source": [
+ "# 課堂範例: 偵測並移除重複資料"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "executionInfo": {
+ "elapsed": 19,
+ "status": "ok",
+ "timestamp": 1759743153219,
+ "user": {
+ "displayName": "黃鈺晴",
+ "userId": "04936038635646045030"
+ },
+ "user_tz": -480
+ },
+ "id": "RSBbM8KK8b13",
+ "outputId": "cf9ec649-1977-46b2-f47c-5d751ab8041c"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "原始資料:\n",
+ " Name Score\n",
+ "0 Alice 90\n",
+ "1 Bob 85\n",
+ "2 Charlie 78\n",
+ "3 Bob 85\n",
+ "4 Alice 90\n",
+ "5 Eve 95\n",
+ "\n",
+ "重複資料筆數: 2\n",
+ "\n",
+ "重複資料:\n",
+ " Name Score\n",
+ "3 Bob 85\n",
+ "4 Alice 90\n",
+ "\n",
+ "移除重複後的資料:\n",
+ " Name Score\n",
+ "0 Alice 90\n",
+ "1 Bob 85\n",
+ "2 Charlie 78\n",
+ "5 Eve 95\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 建立含重複資料的範例\n",
+ "data = {\n",
+ " 'Name': ['Alice', 'Bob', 'Charlie', 'Bob', 'Alice', 'Eve'],\n",
+ " 'Score': [90, 85, 78, 85, 90, 95]\n",
+ "}\n",
+ "df = pd.DataFrame(data)\n",
+ "\n",
+ "print(\"原始資料:\")\n",
+ "print(df)\n",
+ "\n",
+ "# 偵測重複資料\n",
+ "print(\"\\n重複資料筆數:\", df.duplicated().sum())\n",
+ "print(\"\\n重複資料:\")\n",
+ "print(df[df.duplicated()])\n",
+ "\n",
+ "# 移除重複資料(保留第一次出現)\n",
+ "df_cleaned = df.drop_duplicates()\n",
+ "print(\"\\n移除重複後的資料:\")\n",
+ "print(df_cleaned)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}