import pandas as pd
from sklearn import metrics
!wget -q --show-progress "https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%201%20-%205/Session%202b%20-%20Logistic%20Regression/cancer.csv"
data = pd.read_csv('cancer.csv')
# Recode the text labels as numbers: malignant ('M') -> 1, benign ('B') -> 0.
# Assigning the result back avoids the chained-assignment warnings that
# `inplace=True` on a single column can trigger in recent pandas versions.
data['diagnosis'] = data['diagnosis'].replace({'M': 1, 'B': 0})
# Write the relabeled data back out; index=False keeps pandas from adding
# an extra unnamed index column to the file.
data.to_csv('cancer.csv', index=False)
del data
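# A quick sanity check (optional): re-read the rewritten file and confirm the
# labels are now 0/1. This is a minimal sketch; `value_counts` simply tallies
# how many rows carry each label.
check = pd.read_csv('cancer.csv')
print(check['diagnosis'].value_counts())  # expect counts for 1 (malignant) and 0 (benign)
del check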
# First, import helpful Python tools for loading/navigating data
import os # Good for navigating your computer's files
import numpy as np # Great for lists (arrays) of numbers
import pandas as pd # Great for tables (google spreadsheets, microsoft excel, csv)
from sklearn.metrics import accuracy_score # Great for scoring how well a model's predictions match the true labels
# This is the name of our data file, which was downloaded in the set up cell.
# Check out the file explorer (folder on the left toolbar) to see where that lives!
data_path = 'cancer.csv'
# Use the 'pd.read_csv(filepath)' function to read in our data and store it
# in a variable called 'dataframe'
dataframe = pd.read_csv(data_path)
# Redefine `dataframe` to include only the columns discussed
dataframe = dataframe[['diagnosis', 'perimeter_mean', 'radius_mean', 'texture_mean', 'area_mean', 'smoothness_mean', 'concavity_mean', 'symmetry_mean']]
# Define a new, more descriptive `diagnosis_cat` column
dataframe['diagnosis_cat'] = dataframe['diagnosis'].astype('category').map({1: '1 (malignant)', 0: '0 (benign)'})
dataframe.head(20)
 | diagnosis | perimeter_mean | radius_mean | texture_mean | area_mean | smoothness_mean | concavity_mean | symmetry_mean | diagnosis_cat |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 122.80 | 17.99 | 10.38 | 1001.0 | 0.11840 | 0.30010 | 0.2419 | 1 (malignant) |
1 | 1 | 132.90 | 20.57 | 17.77 | 1326.0 | 0.08474 | 0.08690 | 0.1812 | 1 (malignant) |
2 | 1 | 130.00 | 19.69 | 21.25 | 1203.0 | 0.10960 | 0.19740 | 0.2069 | 1 (malignant) |
3 | 1 | 77.58 | 11.42 | 20.38 | 386.1 | 0.14250 | 0.24140 | 0.2597 | 1 (malignant) |
4 | 1 | 135.10 | 20.29 | 14.34 | 1297.0 | 0.10030 | 0.19800 | 0.1809 | 1 (malignant) |
5 | 1 | 82.57 | 12.45 | 15.70 | 477.1 | 0.12780 | 0.15780 | 0.2087 | 1 (malignant) |
6 | 1 | 119.60 | 18.25 | 19.98 | 1040.0 | 0.09463 | 0.11270 | 0.1794 | 1 (malignant) |
7 | 1 | 90.20 | 13.71 | 20.83 | 577.9 | 0.11890 | 0.09366 | 0.2196 | 1 (malignant) |
8 | 1 | 87.50 | 13.00 | 21.82 | 519.8 | 0.12730 | 0.18590 | 0.2350 | 1 (malignant) |
9 | 1 | 83.97 | 12.46 | 24.04 | 475.9 | 0.11860 | 0.22730 | 0.2030 | 1 (malignant) |
10 | 1 | 102.70 | 16.02 | 23.24 | 797.8 | 0.08206 | 0.03299 | 0.1528 | 1 (malignant) |
11 | 1 | 103.60 | 15.78 | 17.89 | 781.0 | 0.09710 | 0.09954 | 0.1842 | 1 (malignant) |
12 | 1 | 132.40 | 19.17 | 24.80 | 1123.0 | 0.09740 | 0.20650 | 0.2397 | 1 (malignant) |
13 | 1 | 103.70 | 15.85 | 23.95 | 782.7 | 0.08401 | 0.09938 | 0.1847 | 1 (malignant) |
14 | 1 | 93.60 | 13.73 | 22.61 | 578.3 | 0.11310 | 0.21280 | 0.2069 | 1 (malignant) |
15 | 1 | 96.73 | 14.54 | 27.54 | 658.8 | 0.11390 | 0.16390 | 0.2303 | 1 (malignant) |
16 | 1 | 94.74 | 14.68 | 20.13 | 684.5 | 0.09867 | 0.07395 | 0.1586 | 1 (malignant) |
17 | 1 | 108.10 | 16.13 | 20.68 | 798.8 | 0.11700 | 0.17220 | 0.2164 | 1 (malignant) |
18 | 1 | 130.00 | 19.81 | 22.15 | 1260.0 | 0.09831 | 0.14790 | 0.1582 | 1 (malignant) |
19 | 0 | 87.46 | 13.54 | 14.36 | 566.3 | 0.09779 | 0.06664 | 0.1885 | 0 (benign) |
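# Before modeling, it can help to check how balanced the two classes are.
# A minimal sketch using the `dataframe` defined above: `value_counts` with
# normalize=True reports each class's share of the rows.
print(dataframe['diagnosis_cat'].value_counts(normalize=True))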
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(dataframe, test_size = 0.2, random_state = 1)
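# seaborn and matplotlib are imported above; one way they could be used here is
# a scatterplot of two features, colored by diagnosis, to see how separable the
# classes look. This is an optional sketch, not part of the model pipeline.
sns.scatterplot(data=train_df, x='radius_mean', y='concavity_mean', hue='diagnosis_cat')
plt.title('Training data: radius vs. concavity by diagnosis')
plt.show()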
logreg_model = linear_model.LogisticRegression() # create an (untrained) logistic regression model
X = ['perimeter_mean','radius_mean','texture_mean','area_mean','smoothness_mean','concavity_mean','symmetry_mean']
y = 'diagnosis_cat'
X_train = train_df[X]
print('X_train, our input variables:')
print(X_train.head())
print()
y_train = train_df[y]
print('y_train, our output variable:')
print(y_train.head())
X_train, our input variables:
perimeter_mean radius_mean texture_mean area_mean smoothness_mean \
408 117.80 17.99 20.66 991.7 0.10360
4 135.10 20.29 14.34 1297.0 0.10030
307 56.36 9.00 14.40 246.3 0.07005
386 78.78 12.21 14.09 462.0 0.08108
404 78.29 12.34 14.95 469.1 0.08682
concavity_mean symmetry_mean
408 0.120100 0.1992
4 0.198000 0.1809
307 0.003681 0.1788
386 0.068390 0.1646
404 0.021090 0.1571
y_train, our output variable:
408 1 (malignant)
4 1 (malignant)
307 0 (benign)
386 0 (benign)
404 0 (benign)
Name: diagnosis_cat, dtype: category
Categories (2, object): ['0 (benign)', '1 (malignant)']
logreg_model.fit(X_train, y_train)
LogisticRegression()
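# After fitting, the learned weights are available on the model. A minimal
# sketch pairing each input column with its coefficient; positive weights push
# predictions toward the malignant class, negative toward benign.
for name, coef in zip(X, logreg_model.coef_[0]):
    print(f'{name}: {coef:.4f}')
print('intercept:', logreg_model.intercept_[0])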
X_test = test_df[X]
y_test = test_df[y]
y_pred = logreg_model.predict(X_test)
# Attach the predictions to the test table so they can be compared row-by-row
test_df['predicted'] = y_pred
print(X_test)
print(y_pred)
perimeter_mean radius_mean texture_mean area_mean smoothness_mean \
421 98.22 14.690 13.98 656.1 0.10310
47 85.98 13.170 18.66 534.6 0.11580
292 83.14 12.950 16.02 513.7 0.10050
186 118.60 18.310 18.58 1041.0 0.08588
414 96.71 15.130 29.81 719.5 0.08320
.. ... ... ... ... ...
172 102.50 15.460 11.89 736.9 0.12570
3 77.58 11.420 20.38 386.1 0.14250
68 58.79 9.029 17.33 250.5 0.10660
448 94.25 14.530 19.34 659.7 0.08388
442 88.37 13.780 15.79 585.9 0.08817
concavity_mean symmetry_mean
421 0.14500 0.2086
47 0.12260 0.2128
292 0.06155 0.1730
186 0.08169 0.1621
414 0.04686 0.1852
.. ... ...
172 0.20320 0.1966
3 0.24140 0.2597
68 0.31300 0.2111
448 0.08817 0.1473
442 0.01055 0.1405
[114 rows x 7 columns]
['1 (malignant)' '0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)'
'1 (malignant)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
'0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)' '1 (malignant)'
'0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
'0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
'0 (benign)' '1 (malignant)' '1 (malignant)' '1 (malignant)'
'1 (malignant)' '1 (malignant)' '0 (benign)' '1 (malignant)' '0 (benign)'
'0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)'
'0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
'0 (benign)' '1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)'
'1 (malignant)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
'0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)'
'0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)'
'0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)'
'0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
'0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
'1 (malignant)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
'0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
'0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
'1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
'0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
'1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
'1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)']
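# `predict` returns hard labels, but logistic regression is really estimating
# probabilities. A sketch of looking at the first few estimates with
# `predict_proba`; each row gives one tumor's probability for each class.
probs = logreg_model.predict_proba(X_test)
print(logreg_model.classes_)  # column order of the probabilities
print(probs[:5])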
# Fraction of test-set tumors the model classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
0.8771929824561403
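# Accuracy alone can hide which kind of mistake the model makes. A sketch of a
# fuller breakdown with scikit-learn's metrics: the confusion matrix counts
# correct and incorrect predictions per class, and the classification report
# adds precision and recall for each class.
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))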