import pandas as pd
from sklearn import metrics

# Notebook shell command (IPython "!" magic): download the breast-cancer CSV
# into the working directory. Only runs inside Jupyter/Colab, not plain Python.
!wget -q --show-progress "https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%201%20-%205/Session%202b%20-%20Logistic%20Regression/cancer.csv"

# One-time preprocessing: load the raw CSV, encode the diagnosis label as an
# integer (M -> 1, B -> 0), and overwrite the file with the cleaned version.
data = pd.read_csv('cancer.csv')
# Assign the result of replace() instead of using inplace=True, which is
# deprecated for Series.replace in modern pandas.
data['diagnosis'] = data['diagnosis'].replace({'M': 1, 'B': 0})
# index=False keeps the row index out of the file; without it, every later
# read_csv would gain a spurious "Unnamed: 0" column.
data.to_csv('cancer.csv', index=False)
del data  # drop the frame; later cells re-read the cleaned file from disk
/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.1
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"


cancer.csv          100%[===================>] 122.27K  --.-KB/s    in 0.05s   
# First, import helpful Python tools for loading/navigating data
import os             # Good for navigating your computer's files
import numpy as np    # Great for lists (arrays) of numbers
import pandas as pd   # Great for tables (google spreadsheets, microsoft excel, csv)
from sklearn.metrics import accuracy_score   # Great for scoring ML predictions
# This is the name of our data file, which was downloaded in the set up cell.
# Check out the file explorer (folder on the left toolbar) to see where that lives!
data_path = 'cancer.csv'

# Use the 'pd.read_csv(filepath)' function to read in read our data and store it
# in a variable called 'dataframe'
dataframe = pd.read_csv(data_path)

# Redefine `dataframe` to include only the columns discussed.
# .copy() makes the subset an independent frame, so the column assignment
# below cannot trigger pandas' SettingWithCopyWarning on a slice view.
dataframe = dataframe[['diagnosis', 'perimeter_mean', 'radius_mean', 'texture_mean', 'area_mean', 'smoothness_mean', 'concavity_mean', 'symmetry_mean']].copy()

# Define a new, more descriptive `diagnosis_cat` column (human-readable labels)
dataframe['diagnosis_cat'] = dataframe['diagnosis'].astype('category').map({1: '1 (malignant)', 0: '0 (benign)'})
dataframe.head(20)
diagnosis perimeter_mean radius_mean texture_mean area_mean smoothness_mean concavity_mean symmetry_mean diagnosis_cat
0 1 122.80 17.99 10.38 1001.0 0.11840 0.30010 0.2419 1 (malignant)
1 1 132.90 20.57 17.77 1326.0 0.08474 0.08690 0.1812 1 (malignant)
2 1 130.00 19.69 21.25 1203.0 0.10960 0.19740 0.2069 1 (malignant)
3 1 77.58 11.42 20.38 386.1 0.14250 0.24140 0.2597 1 (malignant)
4 1 135.10 20.29 14.34 1297.0 0.10030 0.19800 0.1809 1 (malignant)
5 1 82.57 12.45 15.70 477.1 0.12780 0.15780 0.2087 1 (malignant)
6 1 119.60 18.25 19.98 1040.0 0.09463 0.11270 0.1794 1 (malignant)
7 1 90.20 13.71 20.83 577.9 0.11890 0.09366 0.2196 1 (malignant)
8 1 87.50 13.00 21.82 519.8 0.12730 0.18590 0.2350 1 (malignant)
9 1 83.97 12.46 24.04 475.9 0.11860 0.22730 0.2030 1 (malignant)
10 1 102.70 16.02 23.24 797.8 0.08206 0.03299 0.1528 1 (malignant)
11 1 103.60 15.78 17.89 781.0 0.09710 0.09954 0.1842 1 (malignant)
12 1 132.40 19.17 24.80 1123.0 0.09740 0.20650 0.2397 1 (malignant)
13 1 103.70 15.85 23.95 782.7 0.08401 0.09938 0.1847 1 (malignant)
14 1 93.60 13.73 22.61 578.3 0.11310 0.21280 0.2069 1 (malignant)
15 1 96.73 14.54 27.54 658.8 0.11390 0.16390 0.2303 1 (malignant)
16 1 94.74 14.68 20.13 684.5 0.09867 0.07395 0.1586 1 (malignant)
17 1 108.10 16.13 20.68 798.8 0.11700 0.17220 0.2164 1 (malignant)
18 1 130.00 19.81 22.15 1260.0 0.09831 0.14790 0.1582 1 (malignant)
19 0 87.46 13.54 14.36 566.3 0.09779 0.06664 0.1885 0 (benign)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split (and the printed results below) are reproducible.
train_df, test_df = train_test_split(dataframe, test_size = 0.2, random_state = 1)
# max_iter raised from the default (100): the lbfgs solver does not converge
# on these unscaled features within 100 iterations and emits ConvergenceWarning.
logreg_model = linear_model.LogisticRegression(max_iter = 10000)
# Feature column names (inputs) and the target column name (output).
X = ['perimeter_mean','radius_mean','texture_mean','area_mean','smoothness_mean','concavity_mean','symmetry_mean']
y = 'diagnosis_cat'

X_train = train_df[X]
print('X_train, our input variables:')
print(X_train.head())
print()

y_train = train_df[y]
print('y_train, our output variable:')
print(y_train.head())
X_train, our input variables:
     perimeter_mean  radius_mean  texture_mean  area_mean  smoothness_mean  \
408          117.80        17.99         20.66      991.7          0.10360   
4            135.10        20.29         14.34     1297.0          0.10030   
307           56.36         9.00         14.40      246.3          0.07005   
386           78.78        12.21         14.09      462.0          0.08108   
404           78.29        12.34         14.95      469.1          0.08682   

     concavity_mean  symmetry_mean  
408        0.120100         0.1992  
4          0.198000         0.1809  
307        0.003681         0.1788  
386        0.068390         0.1646  
404        0.021090         0.1571  

y_train, our output variable:
408    1 (malignant)
4      1 (malignant)
307       0 (benign)
386       0 (benign)
404       0 (benign)
Name: diagnosis_cat, dtype: category
Categories (2, object): ['0 (benign)', '1 (malignant)']
# Fit the logistic regression on the training features and labels.
logreg_model.fit(X_train, y_train)
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the fitted model on the held-out split.
X_test = test_df.loc[:, X]   # feature columns for the test rows
y_test = test_df.loc[:, y]   # true diagnosis labels for the test rows
# One predicted label per test row, stored alongside the truth for inspection.
y_pred = logreg_model.predict(X_test)
test_df['predicted'] = y_pred
print(X_test)
print(y_pred)
     perimeter_mean  radius_mean  texture_mean  area_mean  smoothness_mean  \
421           98.22       14.690         13.98      656.1          0.10310   
47            85.98       13.170         18.66      534.6          0.11580   
292           83.14       12.950         16.02      513.7          0.10050   
186          118.60       18.310         18.58     1041.0          0.08588   
414           96.71       15.130         29.81      719.5          0.08320   
..              ...          ...           ...        ...              ...   
172          102.50       15.460         11.89      736.9          0.12570   
3             77.58       11.420         20.38      386.1          0.14250   
68            58.79        9.029         17.33      250.5          0.10660   
448           94.25       14.530         19.34      659.7          0.08388   
442           88.37       13.780         15.79      585.9          0.08817   

     concavity_mean  symmetry_mean  
421         0.14500         0.2086  
47          0.12260         0.2128  
292         0.06155         0.1730  
186         0.08169         0.1621  
414         0.04686         0.1852  
..              ...            ...  
172         0.20320         0.1966  
3           0.24140         0.2597  
68          0.31300         0.2111  
448         0.08817         0.1473  
442         0.01055         0.1405  

[114 rows x 7 columns]
['1 (malignant)' '0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)'
 '1 (malignant)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
 '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '1 (malignant)' '1 (malignant)' '1 (malignant)'
 '1 (malignant)' '1 (malignant)' '0 (benign)' '1 (malignant)' '0 (benign)'
 '0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)'
 '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
 '0 (benign)' '1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)'
 '1 (malignant)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
 '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)'
 '0 (benign)' '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)'
 '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)'
 '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '1 (malignant)' '1 (malignant)' '0 (benign)' '0 (benign)'
 '1 (malignant)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '0 (benign)' '1 (malignant)' '0 (benign)' '1 (malignant)'
 '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
 '1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
 '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)'
 '1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '1 (malignant)'
 '1 (malignant)' '0 (benign)' '0 (benign)' '0 (benign)' '0 (benign)']
# Fraction of test rows where the predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)
0.8771929824561403