|
|
@@ -0,0 +1,518 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 75,
|
|
|
+ "id": "b612c4c1-fa3c-47b9-a8ce-9e32f371e160",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "import urllib.request\n",
|
|
|
+ "import zipfile\n",
|
|
|
+ "import os\n",
|
|
|
+ "from pathlib import Path\n",
|
|
|
+ "\n",
|
|
|
+ "url = \"https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip\"\n",
|
|
|
+ "zip_path = \"sms_spam_collection.zip\"\n",
|
|
|
+ "extract_to = \"sms_spam_collection\"\n",
|
|
|
+ "new_file_path = Path(extract_to) / \"SMSSpamCollection.tsv\"\n",
|
|
|
+ "\n",
|
|
|
+ "def download_and_unzip(url, zip_path, extract_to, new_file_path):\n",
|
|
|
+ " # Check if the target file already exists\n",
|
|
|
+ " if new_file_path.exists():\n",
|
|
|
+ " print(f\"{new_file_path} already exists. Skipping download and extraction.\")\n",
|
|
|
+ " return\n",
|
|
|
+ "\n",
|
|
|
+ " # Downloading the file\n",
|
|
|
+ " with urllib.request.urlopen(url) as response:\n",
|
|
|
+ " with open(zip_path, \"wb\") as out_file:\n",
|
|
|
+ " out_file.write(response.read())\n",
|
|
|
+ "\n",
|
|
|
+ " # Unzipping the file\n",
|
|
|
+ " with zipfile.ZipFile(zip_path, 'r') as zip_ref:\n",
|
|
|
+ " zip_ref.extractall(extract_to)\n",
|
|
|
+ "\n",
|
|
|
+ " # Renaming the file to indicate its format\n",
|
|
|
+ " original_file = Path(extract_to) / \"SMSSpamCollection\"\n",
|
|
|
+ " os.rename(original_file, new_file_path)\n",
|
|
|
+ " print(f\"File download and saved as {new_file_path}\")\n",
|
|
|
+ "\n",
|
|
|
+ "# Execute the function\n",
|
|
|
+ "download_and_unzip(url, zip_path, extract_to, new_file_path)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 76,
|
|
|
+ "id": "69f32433-e19c-4066-b806-8f30b408107f",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "data": {
|
|
|
+ "text/html": [
|
|
|
+ "<div>\n",
|
|
|
+ "<style scoped>\n",
|
|
|
+ " .dataframe tbody tr th:only-of-type {\n",
|
|
|
+ " vertical-align: middle;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe tbody tr th {\n",
|
|
|
+ " vertical-align: top;\n",
|
|
|
+ " }\n",
|
|
|
+ "\n",
|
|
|
+ " .dataframe thead th {\n",
|
|
|
+ " text-align: right;\n",
|
|
|
+ " }\n",
|
|
|
+ "</style>\n",
|
|
|
+ "<table border=\"1\" class=\"dataframe\">\n",
|
|
|
+ " <thead>\n",
|
|
|
+ " <tr style=\"text-align: right;\">\n",
|
|
|
+ " <th></th>\n",
|
|
|
+ " <th>Label</th>\n",
|
|
|
+ " <th>Text</th>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </thead>\n",
|
|
|
+ " <tbody>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>0</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Aight text me when you're back at mu and I'll ...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>1</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Our Prashanthettan's mother passed away last n...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>2</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>No it will reach by 9 only. She telling she wi...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>3</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Do you know when the result.</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>4</th>\n",
|
|
|
+ " <td>spam</td>\n",
|
|
|
+ " <td>Hi. Customer Loyalty Offer:The NEW Nokia6650 M...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>...</th>\n",
|
|
|
+ " <td>...</td>\n",
|
|
|
+ " <td>...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>5567</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>I accidentally brought em home in the box</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>5568</th>\n",
|
|
|
+ " <td>spam</td>\n",
|
|
|
+ " <td>Moby Pub Quiz.Win a £100 High Street prize if ...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>5569</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Que pases un buen tiempo or something like that</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>5570</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Nowadays people are notixiquating the laxinorf...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " <tr>\n",
|
|
|
+ " <th>5571</th>\n",
|
|
|
+ " <td>ham</td>\n",
|
|
|
+ " <td>Ard 4 lor...</td>\n",
|
|
|
+ " </tr>\n",
|
|
|
+ " </tbody>\n",
|
|
|
+ "</table>\n",
|
|
|
+ "<p>5572 rows × 2 columns</p>\n",
|
|
|
+ "</div>"
|
|
|
+ ],
|
|
|
+ "text/plain": [
|
|
|
+ " Label Text\n",
|
|
|
+ "0 ham Aight text me when you're back at mu and I'll ...\n",
|
|
|
+ "1 ham Our Prashanthettan's mother passed away last n...\n",
|
|
|
+ "2 ham No it will reach by 9 only. She telling she wi...\n",
|
|
|
+ "3 ham Do you know when the result.\n",
|
|
|
+ "4 spam Hi. Customer Loyalty Offer:The NEW Nokia6650 M...\n",
|
|
|
+ "... ... ...\n",
|
|
|
+ "5567 ham I accidentally brought em home in the box\n",
|
|
|
+ "5568 spam Moby Pub Quiz.Win a £100 High Street prize if ...\n",
|
|
|
+ "5569 ham Que pases un buen tiempo or something like that\n",
|
|
|
+ "5570 ham Nowadays people are notixiquating the laxinorf...\n",
|
|
|
+ "5571 ham Ard 4 lor...\n",
|
|
|
+ "\n",
|
|
|
+ "[5572 rows x 2 columns]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ "execution_count": 76,
|
|
|
+ "metadata": {},
|
|
|
+ "output_type": "execute_result"
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "import pandas as pd\n",
|
|
|
+ "\n",
|
|
|
+ "df = pd.read_csv(new_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
|
|
+ "df = df.sample(frac=1, random_state=123).reset_index(drop=True) # Shuffle the DataFrame\n",
|
|
|
+ "df"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 77,
|
|
|
+ "id": "4b7beeba-9f3a-45f0-b2dc-76bb155a8f0e",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Label\n",
|
|
|
+ "ham 4825\n",
|
|
|
+ "spam 747\n",
|
|
|
+ "Name: count, dtype: int64\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Class distribution\n",
|
|
|
+ "print(df[\"Label\"].value_counts())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 78,
|
|
|
+ "id": "b3db862a-9e03-4715-babb-9b699e4f4a36",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Label\n",
|
|
|
+ "spam 747\n",
|
|
|
+ "ham 747\n",
|
|
|
+ "Name: count, dtype: int64\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Count the instances of 'spam'\n",
|
|
|
+ "n_spam = df[df[\"Label\"] == \"spam\"].shape[0]\n",
|
|
|
+ "\n",
|
|
|
+ "# Randomly sample 'ham' instances to match the number of 'spam' instances\n",
|
|
|
+ "ham_sampled = df[df[\"Label\"] == \"ham\"].sample(n_spam)\n",
|
|
|
+ "\n",
|
|
|
+ "# Combine the sampled 'ham' with all 'spam'\n",
|
|
|
+ "balanced_df = pd.concat([ham_sampled, df[df[\"Label\"] == \"spam\"]])\n",
|
|
|
+ "\n",
|
|
|
+ "# Shuffle the DataFrame\n",
|
|
|
+ "balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)\n",
|
|
|
+ "\n",
|
|
|
+ "# Now balanced_df is the balanced DataFrame\n",
|
|
|
+ "print(balanced_df[\"Label\"].value_counts())"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 79,
|
|
|
+ "id": "0af991e5-98ef-439a-a43d-63a581a2cc6d",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "df[\"Label\"] = df[\"Label\"].map({\"ham\": 0, \"spam\": 1})"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 80,
|
|
|
+ "id": "2f5b00ef-e3ed-4819-b271-5f355848feb5",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Training set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "0 0.86612\n",
|
|
|
+ "1 0.13388\n",
|
|
|
+ "Name: proportion, dtype: float64\n",
|
|
|
+ "\n",
|
|
|
+ "Validation set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "0 0.866906\n",
|
|
|
+ "1 0.133094\n",
|
|
|
+ "Name: proportion, dtype: float64\n",
|
|
|
+ "\n",
|
|
|
+ "Test set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "0 0.864816\n",
|
|
|
+ "1 0.135184\n",
|
|
|
+ "Name: proportion, dtype: float64\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Define split ratios\n",
|
|
|
+ "train_size, validation_size = 0.7, 0.1\n",
|
|
|
+ "# Test size is implied to be 0.2 as the remainder\n",
|
|
|
+ "\n",
|
|
|
+ "# Split the data\n",
|
|
|
+ "def stratified_split(df, stratify_col, train_frac, validation_frac):\n",
|
|
|
+ " stratified_train = pd.DataFrame()\n",
|
|
|
+ " stratified_validation = pd.DataFrame()\n",
|
|
|
+ " stratified_test = pd.DataFrame()\n",
|
|
|
+ "\n",
|
|
|
+ " # Stratify split by the unique values in the column\n",
|
|
|
+ " for value in df[stratify_col].unique():\n",
|
|
|
+ " # Filter the DataFrame for the class\n",
|
|
|
+ " df_class = df[df[stratify_col] == value]\n",
|
|
|
+ " \n",
|
|
|
+ " # Calculate class split sizes\n",
|
|
|
+ " train_end = int(len(df_class) * train_frac)\n",
|
|
|
+ " validation_end = train_end + int(len(df_class) * validation_frac)\n",
|
|
|
+ " \n",
|
|
|
+ " # Slice the DataFrame to get the sets\n",
|
|
|
+ " stratified_train = pd.concat([stratified_train, df_class[:train_end]], axis=0)\n",
|
|
|
+ " stratified_validation = pd.concat([stratified_validation, df_class[train_end:validation_end]], axis=0)\n",
|
|
|
+ " stratified_test = pd.concat([stratified_test, df_class[validation_end:]], axis=0)\n",
|
|
|
+ "\n",
|
|
|
+ " # Shuffle the sets again\n",
|
|
|
+ " stratified_train = stratified_train.sample(frac=1, random_state=123).reset_index(drop=True)\n",
|
|
|
+ " stratified_validation = stratified_validation.sample(frac=1, random_state=123).reset_index(drop=True)\n",
|
|
|
+ " stratified_test = stratified_test.sample(frac=1, random_state=123).reset_index(drop=True)\n",
|
|
|
+ "\n",
|
|
|
+ " return stratified_train, stratified_validation, stratified_test\n",
|
|
|
+ "\n",
|
|
|
+ "# Apply the stratified split function\n",
|
|
|
+ "train_df, validation_df, test_df = stratified_split(df, \"Label\", train_size, validation_size)\n",
|
|
|
+ "\n",
|
|
|
+ "# Check the results\n",
|
|
|
+ "print(f\"Training set:\\n{train_df['Label'].value_counts(normalize=True)}\")\n",
|
|
|
+ "print(f\"\\nValidation set:\\n{validation_df['Label'].value_counts(normalize=True)}\")\n",
|
|
|
+ "print(f\"\\nTest set:\\n{test_df['Label'].value_counts(normalize=True)}\")"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 81,
|
|
|
+ "id": "65808167-2b93-45b0-8506-ce722732ce77",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Training set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "ham 0.5\n",
|
|
|
+ "spam 0.5\n",
|
|
|
+ "Name: proportion, dtype: float64\n",
|
|
|
+ "\n",
|
|
|
+ "Validation set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "ham 0.5\n",
|
|
|
+ "spam 0.5\n",
|
|
|
+ "Name: proportion, dtype: float64\n",
|
|
|
+ "\n",
|
|
|
+ "Test set:\n",
|
|
|
+ "Label\n",
|
|
|
+ "spam 0.5\n",
|
|
|
+ "ham 0.5\n",
|
|
|
+ "Name: proportion, dtype: float64\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "# Define split ratios\n",
|
|
|
+ "train_size, validation_size = 0.7, 0.1\n",
|
|
|
+ "# Test size is implied to be 0.2 as the remainder\n",
|
|
|
+ "\n",
|
|
|
+ "# Apply the stratified split function\n",
|
|
|
+ "train_df, validation_df, test_df = stratified_split(balanced_df, \"Label\", train_size, validation_size)\n",
|
|
|
+ "\n",
|
|
|
+ "# Check the results\n",
|
|
|
+ "print(f\"Training set:\\n{train_df['Label'].value_counts(normalize=True)}\")\n",
|
|
|
+ "print(f\"\\nValidation set:\\n{validation_df['Label'].value_counts(normalize=True)}\")\n",
|
|
|
+ "print(f\"\\nTest set:\\n{test_df['Label'].value_counts(normalize=True)}\")"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "markdown",
|
|
|
+ "id": "fae87bc1-14ca-4f89-8e12-49f77b0ec00d",
|
|
|
+ "metadata": {},
|
|
|
+ "source": [
|
|
|
+ "## Scikit-learn baseline"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 82,
|
|
|
+ "id": "180318b7-de18-4b05-b84a-ba97c72b9d8e",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
|
|
|
+ "from sklearn.linear_model import LogisticRegression\n",
|
|
|
+ "from sklearn.metrics import accuracy_score, balanced_accuracy_score"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 83,
|
|
|
+ "id": "25090b7c-f516-4be2-8083-3a7187fe4635",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "vectorizer = CountVectorizer()\n",
|
|
|
+ "\n",
|
|
|
+ "X_train = vectorizer.fit_transform(train_df[\"Text\"])\n",
|
|
|
+ "X_val = vectorizer.transform(validation_df[\"Text\"])\n",
|
|
|
+ "X_test = vectorizer.transform(test_df[\"Text\"])\n",
|
|
|
+ "\n",
|
|
|
+ "y_train, y_val, y_test = train_df[\"Label\"], validation_df[\"Label\"], test_df[\"Label\"]"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 84,
|
|
|
+ "id": "0247de3a-88f0-4b9c-becd-157baf3acf49",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "def eval(model, X_train, y_train, X_val, y_val, X_test, y_test):\n",
|
|
|
+ " # Making predictions\n",
|
|
|
+ " y_pred_train = model.predict(X_train)\n",
|
|
|
+ " y_pred_val = model.predict(X_val)\n",
|
|
|
+ " y_pred_test = model.predict(X_test)\n",
|
|
|
+ " \n",
|
|
|
+ " # Calculating accuracy and balanced accuracy\n",
|
|
|
+ " accuracy_train = accuracy_score(y_train, y_pred_train)\n",
|
|
|
+ " balanced_accuracy_train = balanced_accuracy_score(y_train, y_pred_train)\n",
|
|
|
+ " \n",
|
|
|
+ " accuracy_val = accuracy_score(y_val, y_pred_val)\n",
|
|
|
+ " balanced_accuracy_val = balanced_accuracy_score(y_val, y_pred_val)\n",
|
|
|
+ "\n",
|
|
|
+ " accuracy_test = accuracy_score(y_test, y_pred_test)\n",
|
|
|
+ " balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred_test)\n",
|
|
|
+ " \n",
|
|
|
+ " # Printing the results\n",
|
|
|
+ " print(f\"Training Accuracy: {accuracy_train*100:.2f}%\")\n",
|
|
|
+ " print(f\"Validation Accuracy: {accuracy_val*100:.2f}%\")\n",
|
|
|
+ " print(f\"Test Accuracy: {accuracy_test*100:.2f}%\")\n",
|
|
|
+ " \n",
|
|
|
+ " print(f\"\\nTraining Balanced Accuracy: {balanced_accuracy_train*100:.2f}%\")\n",
|
|
|
+ " print(f\"Validation Balanced Accuracy: {balanced_accuracy_val*100:.2f}%\")\n",
|
|
|
+ " print(f\"Test Balanced Accuracy: {balanced_accuracy_test*100:.2f}%\")"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 85,
|
|
|
+ "id": "c29c6dfc-f72d-40ab-8cb5-783aad1a15ab",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Training Accuracy: 50.00%\n",
|
|
|
+ "Validation Accuracy: 50.00%\n",
|
|
|
+ "Test Accuracy: 50.00%\n",
|
|
|
+ "\n",
|
|
|
+ "Training Balanced Accuracy: 50.00%\n",
|
|
|
+ "Validation Balanced Accuracy: 50.00%\n",
|
|
|
+ "Test Balanced Accuracy: 50.00%\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "from sklearn.dummy import DummyClassifier\n",
|
|
|
+ "\n",
|
|
|
+ "# Create a dummy classifier with the strategy to predict the most frequent class\n",
|
|
|
+ "dummy_clf = DummyClassifier(strategy=\"most_frequent\")\n",
|
|
|
+ "dummy_clf.fit(X_train, y_train)\n",
|
|
|
+ "\n",
|
|
|
+ "eval(dummy_clf, X_train, y_train, X_val, y_val, X_test, y_test)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 86,
|
|
|
+ "id": "088a8a3a-3b74-4d10-a51b-cb662569ae39",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Training Accuracy: 99.81%\n",
|
|
|
+ "Validation Accuracy: 95.27%\n",
|
|
|
+ "Test Accuracy: 96.03%\n",
|
|
|
+ "\n",
|
|
|
+ "Training Balanced Accuracy: 99.81%\n",
|
|
|
+ "Validation Balanced Accuracy: 95.27%\n",
|
|
|
+ "Test Balanced Accuracy: 96.03%\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "model = LogisticRegression(max_iter=1000)\n",
|
|
|
+ "model.fit(X_train, y_train)\n",
|
|
|
+ "eval(model, X_train, y_train, X_val, y_val, X_test, y_test)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "id": "34411348-45bc-4b01-bebf-b3602c002ef1",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "id": "5a9bc6b1-c8b9-4d4f-bfe4-c5a4a8b0c756",
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3 (ipykernel)",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.10.12"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 5
|
|
|
+}
|