{
"cells": [
{
"cell_type": "code",
"execution_count": 224,
"id": "23296b19-b819-49ea-a95b-00a2b3ed1840",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (1.2.2)\n",
"Requirement already satisfied: numpy>=1.17.3 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from scikit-learn) (1.25.0)\n",
"Requirement already satisfied: scipy>=1.3.2 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from scikit-learn) (1.10.1)\n",
"Requirement already satisfied: joblib>=1.1.1 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from scikit-learn) (1.2.0)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages (from scikit-learn) (3.1.0)\n"
]
}
],
"source": [
"# %pip (rather than !pip) installs into the environment of the running kernel; version pinned for reproducibility.\n",
"%pip install scikit-learn==1.2.2"
]
},
{
"cell_type": "code",
"execution_count": 249,
"id": "ba97b632-b1b5-48ac-b3a7-3d14dd2ab7ef",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn import tree\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"id": "0018cbce-f858-4863-a25e-6abe28261112",
"metadata": {},
"source": [
"## Loading Data"
]
},
{
"cell_type": "code",
"execution_count": 226,
"id": "74e2ecec-5ba3-405b-ae99-6b6a51d54e26",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" Favorite Transport \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 5 \n",
" female \n",
" NaN \n",
" bicycle \n",
" \n",
" \n",
" 1 \n",
" 8 \n",
" male \n",
" NaN \n",
" scooter \n",
" \n",
" \n",
" 2 \n",
" 10 \n",
" female \n",
" NaN \n",
" bicycle \n",
" \n",
" \n",
" 3 \n",
" 14 \n",
" male \n",
" NaN \n",
" metro \n",
" \n",
" \n",
" 4 \n",
" 16 \n",
" male \n",
" NaN \n",
" metro \n",
" \n",
" \n",
" 5 \n",
" 18 \n",
" female \n",
" NaN \n",
" metro \n",
" \n",
" \n",
" 6 \n",
" 20 \n",
" male \n",
" 200.0 \n",
" scooter \n",
" \n",
" \n",
" 7 \n",
" 22 \n",
" female \n",
" 500.0 \n",
" scooter \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income Favorite Transport\n",
"0 5 female NaN bicycle\n",
"1 8 male NaN scooter\n",
"2 10 female NaN bicycle\n",
"3 14 male NaN metro\n",
"4 16 male NaN metro\n",
"5 18 female NaN metro\n",
"6 20 male 200.0 scooter\n",
"7 22 female 500.0 scooter"
]
},
"execution_count": 226,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read vehicles.csv into a DataFrame and preview its first eight rows.\n",
"df = pd.read_csv(filepath_or_buffer='vehicles.csv')\n",
"df.head(n=8)"
]
},
{
"cell_type": "code",
"execution_count": 227,
"id": "9d56d497-2fc5-4724-8f4a-47ef6fb09f58",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age int64\n",
"Gender object\n",
"Income float64\n",
"Favorite Transport object\n",
"dtype: object"
]
},
"execution_count": 227,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "fab98c7a-a6ef-4583-bde6-e28d11fc6c54",
"metadata": {},
"source": [
"## Cleaning"
]
},
{
"cell_type": "code",
"execution_count": 228,
"id": "60b8aea3-47ae-4457-91b2-6698cb0442b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age 0\n",
"Gender 0\n",
"Income 6\n",
"Favorite Transport 0\n",
"dtype: int64"
]
},
"execution_count": 228,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 229,
"id": "d0deb832-8d8e-499f-98d1-0d2181aca231",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" Favorite Transport \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 5 \n",
" female \n",
" 0.0 \n",
" bicycle \n",
" \n",
" \n",
" 1 \n",
" 8 \n",
" male \n",
" 0.0 \n",
" scooter \n",
" \n",
" \n",
" 2 \n",
" 10 \n",
" female \n",
" 0.0 \n",
" bicycle \n",
" \n",
" \n",
" 3 \n",
" 14 \n",
" male \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 4 \n",
" 16 \n",
" male \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 5 \n",
" 18 \n",
" female \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 6 \n",
" 20 \n",
" male \n",
" 200.0 \n",
" scooter \n",
" \n",
" \n",
" 7 \n",
" 22 \n",
" female \n",
" 500.0 \n",
" scooter \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income Favorite Transport\n",
"0 5 female 0.0 bicycle\n",
"1 8 male 0.0 scooter\n",
"2 10 female 0.0 bicycle\n",
"3 14 male 0.0 metro\n",
"4 16 male 0.0 metro\n",
"5 18 female 0.0 metro\n",
"6 20 male 200.0 scooter\n",
"7 22 female 500.0 scooter"
]
},
"execution_count": 229,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Assign the filled column back instead of calling fillna(inplace=True) on the selected column:\n",
"# the in-place form acts on an intermediate object and is not guaranteed to update df (pandas Copy-on-Write).\n",
"df['Income'] = df['Income'].fillna(0.0)\n",
"df.head(8)"
]
},
{
"cell_type": "code",
"execution_count": 230,
"id": "558e2185-9d4f-49a8-811d-e99b03f3b2c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age 0\n",
"Gender 0\n",
"Income 0\n",
"Favorite Transport 0\n",
"dtype: int64"
]
},
"execution_count": 230,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "markdown",
"id": "e3fbf5ee-f577-46a5-825a-fa62908aa360",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Encoding"
]
},
{
"cell_type": "code",
"execution_count": 231,
"id": "30954355-f1e9-498c-bda7-37bccfa5a418",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['female', 'male'], dtype=object)"
]
},
"execution_count": 231,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['Gender'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 232,
"id": "ddbe3c0d-9d65-4386-b3e5-bc0ca620910b",
"metadata": {},
"outputs": [],
"source": [
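"# OPTION 1 (disabled): manual mapping instead of LabelEncoder.\n",
"# NOTE: this maps male -> 0 / female -> 1, the reverse of the LabelEncoder result used in this notebook (female -> 0, male -> 1).\n",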
"# df.replace({\n",
"# 'Gender': {\n",
"# 'male': 0,\n",
"# 'female': 1\n",
"# }\n",
"# }, inplace=True)\n",
"# df.head()"
]
},
{
"cell_type": "code",
"execution_count": 233,
"id": "7b702899-a787-49f0-866d-883fe016286f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" Favorite Transport \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 5 \n",
" 0 \n",
" 0.0 \n",
" bicycle \n",
" \n",
" \n",
" 1 \n",
" 8 \n",
" 1 \n",
" 0.0 \n",
" scooter \n",
" \n",
" \n",
" 2 \n",
" 10 \n",
" 0 \n",
" 0.0 \n",
" bicycle \n",
" \n",
" \n",
" 3 \n",
" 14 \n",
" 1 \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 4 \n",
" 16 \n",
" 1 \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 5 \n",
" 18 \n",
" 0 \n",
" 0.0 \n",
" metro \n",
" \n",
" \n",
" 6 \n",
" 20 \n",
" 1 \n",
" 200.0 \n",
" scooter \n",
" \n",
" \n",
" 7 \n",
" 22 \n",
" 0 \n",
" 500.0 \n",
" scooter \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income Favorite Transport\n",
"0 5 0 0.0 bicycle\n",
"1 8 1 0.0 scooter\n",
"2 10 0 0.0 bicycle\n",
"3 14 1 0.0 metro\n",
"4 16 1 0.0 metro\n",
"5 18 0 0.0 metro\n",
"6 20 1 200.0 scooter\n",
"7 22 0 500.0 scooter"
]
},
"execution_count": 233,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# OPTION 2\n",
"label_encoder = LabelEncoder()\n",
"df['Gender'] = label_encoder.fit_transform(df['Gender'])\n",
"df.head(8)"
]
},
{
"cell_type": "code",
"execution_count": 234,
"id": "2738a4db-ddf7-4cb5-a925-8a1783c94ea3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age int64\n",
"Gender int64\n",
"Income float64\n",
"Favorite Transport object\n",
"dtype: object"
]
},
"execution_count": 234,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "c738286a-bc6f-40cb-9638-01b6bd35013a",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 235,
"id": "7ec00f4b-46fd-449e-a664-42c78edd06e1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 5 \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 8 \n",
" 1 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 10 \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income\n",
"0 5 0 0.0\n",
"1 8 1 0.0\n",
"2 10 0 0.0"
]
},
"execution_count": 235,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = df.drop(columns='Favorite Transport')\n",
"X.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 236,
"id": "9d1f31e2-9795-44b1-8398-e0782875a25b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 bicycle\n",
"1 scooter\n",
"2 bicycle\n",
"Name: Favorite Transport, dtype: object"
]
},
"execution_count": 236,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = df['Favorite Transport']\n",
"y.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 237,
"id": "25a64b54-86e8-4c36-8b48-fd2469e8d88d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"DecisionTreeClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"DecisionTreeClassifier()"
]
},
"execution_count": 237,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Seed the tree so ties between equally good splits are broken the same way on every run.\n",
"model = DecisionTreeClassifier(random_state=42)\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 238,
"id": "6f65f956-3a41-4b56-95ff-378146ca3a75",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"DecisionTreeClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"DecisionTreeClassifier()"
]
},
"execution_count": 238,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.fit(X, y)"
]
},
{
"cell_type": "markdown",
"id": "3b1f9f8f-d013-42a4-92e4-80714309aded",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"source": [
"## Prediction"
]
},
{
"cell_type": "code",
"execution_count": 239,
"id": "5e162fb6-7657-4cdf-878f-eb51af5a67a5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 12 \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 30 \n",
" 0 \n",
" 4000.0 \n",
" \n",
" \n",
" 2 \n",
" 75 \n",
" 1 \n",
" 50000.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income\n",
"0 12 0 0.0\n",
"1 30 0 4000.0\n",
"2 75 1 50000.0"
]
},
"execution_count": 239,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Three hand-written rows with the same columns as X, used to try out the fitted model.\n",
"test_df = pd.DataFrame(dict(\n",
"    Age=[12, 30, 75],\n",
"    Gender=[0, 0, 1],\n",
"    Income=[0.0, 4000, 50000],\n",
"))\n",
"test_df"
]
},
{
"cell_type": "code",
"execution_count": 240,
"id": "e91b6190-6284-417a-8604-a2fb0cf9e180",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['bicycle', 'car', 'helicopter'], dtype=object)"
]
},
"execution_count": 240,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.predict(test_df)"
]
},
{
"cell_type": "markdown",
"id": "10e309d8-e47e-46b8-81ab-01ab41e8637e",
"metadata": {},
"source": [
"## Exporting to the DOT file"
]
},
{
"cell_type": "code",
"execution_count": 241,
"id": "2f20590b-2523-48f1-af59-c7410d144d03",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [],
"source": [
"# Take feature and class names from the training frame and the fitted model\n",
"# so the labels written to the DOT file always match what the tree was trained on.\n",
"tree.export_graphviz(\n",
"    model,\n",
"    out_file='decision_tree_model.dot',\n",
"    feature_names=list(X.columns),\n",
"    class_names=list(model.classes_),\n",
"    filled=True,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "3d27849c-a508-4c60-8291-07168fc5b9e2",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 242,
"id": "740f03d0-9585-4bdd-9432-5053d2f51fae",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"(26, 3)"
]
},
"execution_count": 242,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fix the seed so the split (and therefore the accuracy reported below) is reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 243,
"id": "4521cc90-c92a-476a-8b14-1460c3922789",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(20, 3)"
]
},
"execution_count": 243,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 244,
"id": "2950dc61-cd35-43ed-8475-156089b32e66",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6, 3)"
]
},
"execution_count": 244,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test.shape"
]
},
{
"cell_type": "code",
"execution_count": 245,
"id": "a21a640a-b237-49f9-9d5a-48fd0e5a991e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"DecisionTreeClassifier() In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org. "
],
"text/plain": [
"DecisionTreeClassifier()"
]
},
"execution_count": 245,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fresh classifier fitted on the training split only; seeded so repeated runs build the same tree.\n",
"# NOTE: this rebinds `model`, replacing the classifier that was fitted on the full dataset earlier.\n",
"model = DecisionTreeClassifier(random_state=42)\n",
"model.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 246,
"id": "be4263fa-61de-4b12-b36c-ece82c021251",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Age \n",
" Gender \n",
" Income \n",
" \n",
" \n",
" \n",
" \n",
" 14 \n",
" 45 \n",
" 1 \n",
" 3000.0 \n",
" \n",
" \n",
" 9 \n",
" 25 \n",
" 0 \n",
" 800.0 \n",
" \n",
" \n",
" 15 \n",
" 47 \n",
" 0 \n",
" 6000.0 \n",
" \n",
" \n",
" 0 \n",
" 5 \n",
" 0 \n",
" 0.0 \n",
" \n",
" \n",
" 18 \n",
" 56 \n",
" 0 \n",
" 1400.0 \n",
" \n",
" \n",
" 4 \n",
" 16 \n",
" 1 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Gender Income\n",
"14 45 1 3000.0\n",
"9 25 0 800.0\n",
"15 47 0 6000.0\n",
"0 5 0 0.0\n",
"18 56 0 1400.0\n",
"4 16 1 0.0"
]
},
"execution_count": 246,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "9861518b-5884-4a98-9a3c-bea957e15761",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"array(['car', 'scooter', 'car', 'scooter', 'taxi', 'metro'], dtype=object)"
]
},
"execution_count": 247,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = model.predict(X_test)\n",
"predictions"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "2755b217-c038-4d57-a471-58f1959e75bd",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"0.5"
]
},
"execution_count": 248,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fraction of held-out rows whose predicted transport equals the true label.\n",
"model_accuracy_score = accuracy_score(y_true=y_test, y_pred=predictions)\n",
"model_accuracy_score"
]
},
{
"cell_type": "markdown",
"id": "3f49c42f-808f-4f72-b798-ae80949593e4",
"metadata": {},
"source": [
"## Charts"
]
},
{
"cell_type": "code",
"execution_count": 250,
"id": "09994882-7ba1-4f66-93bc-7ffd240d2698",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explicit figure/axes plus a title and axis labels so the chart can be read on its own.\n",
"# Gender is shown as the integer codes produced in the Encoding section.\n",
"fig, ax = plt.subplots()\n",
"sns.countplot(data=df, x='Gender', hue='Favorite Transport', ax=ax)\n",
"ax.set(title='Favorite transport by gender', xlabel='Gender (label-encoded)', ylabel='Count')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 251,
"id": "d20571d9-7458-4073-ba17-c49267cbe596",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explicit figure/axes plus a title and axis labels so the chart can be read on its own.\n",
"fig, ax = plt.subplots()\n",
"sns.histplot(data=df, x='Income', hue='Favorite Transport', ax=ax)\n",
"ax.set(title='Income distribution by favorite transport', xlabel='Income', ylabel='Count')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}