EX NO: 1
DECISION TREE BASED ID3 ALGORITHM
DATE:
AIM:
To write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.
Source Code:
import math

import numpy as np

from data_loader import read_data
class Node:
    """One node of the decision tree.

    Attributes:
        attribute: name of the attribute tested at this node ("" for a leaf).
        children: list of ``(attribute_value, Node)`` pairs, one per branch.
        answer: class label stored at a leaf; "" for an internal node.
    """

    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

    def __str__(self):
        return self.attribute
def subtables(data, col, delete):
    """Split the rows of *data* by the distinct values in column *col*.

    Args:
        data: 2-D NumPy array of attribute values (class label in the last column).
        col: index of the column to partition on.
        delete: when true, column *col* is removed from every sub-table.

    Returns:
        ``(items, tables)`` where ``items`` is the sorted array of distinct
        values in the column and ``tables`` maps each value to the array of
        rows carrying it.
    """
    tables = {}
    items = np.unique(data[:, col])
    for item in items:
        rows = data[data[:, col] == item]
        if delete:
            rows = np.delete(rows, col, 1)
        # Sub-tables are stored as 32-byte strings, exactly as in the original
        # listing; this is why values print as b'...' below the root.
        tables[item] = rows.astype("|S32")
    return items, tables
def entropy(S):
    """Return the base-2 Shannon entropy of the labels in array *S*.

    Returns 0 when *S* is empty or holds a single distinct label.
    """
    items, counts = np.unique(S, return_counts=True)
    if items.size <= 1:
        return 0
    probabilities = counts / float(S.size)
    return float(-np.sum(probabilities * np.log2(probabilities)))
def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting *data* on column *col*.

    Gain ratio is information gain divided by the intrinsic value of the
    split. The class label is taken from the last column of *data*.
    """
    items, tables = subtables(data, col, delete=False)
    total_size = float(data.shape[0])

    remainder = 0.0        # weighted entropy left after the split
    intrinsic_value = 0.0  # entropy of the split proportions themselves
    for item in items:
        subset = tables[item]
        ratio = subset.shape[0] / total_size
        remainder += ratio * entropy(subset[:, -1])
        intrinsic_value -= ratio * math.log(ratio, 2)

    # A column with a single distinct value has zero intrinsic value; the
    # original divided 0 by 0 here and the resulting NaN won np.argmax.
    if intrinsic_value == 0:
        return 0.0
    return (entropy(data[:, -1]) - remainder) / intrinsic_value
def create_node(data, metadata):
    """Recursively build the decision tree for *data*.

    Args:
        data: 2-D array whose last column is the class label.
        metadata: attribute names, one per column of *data*.

    Returns:
        The root ``Node`` of the (sub)tree.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    n_attributes = data.shape[1] - 1

    # Leaf: either every row has the same class, or no attribute is left to
    # split on (then the majority class is used; the original crashed in
    # np.argmax on an empty array in that case).
    if labels.shape[0] == 1 or n_attributes == 0:
        node = Node("")
        node.answer = labels[np.argmax(label_counts)]
        return node

    gains = np.zeros((n_attributes, 1))
    for col in range(n_attributes):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)

    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)
    items, tables = subtables(data, split, delete=True)
    for item in items:
        child = create_node(tables[item], metadata)
        node.children.append((item, child))
    return node


def empty(size):
    """Return a string of *size* spaces (empty when *size* is not positive)."""
    return " " * size
def print_tree(node, level):
    """Print the tree rooted at *node*, indenting by *level* spaces.

    A leaf prints its answer. An internal node prints its attribute, then each
    branch value one level deeper and the matching subtree two levels deeper.
    """
    if node.answer != "":
        print(empty(level), node.answer)
        return
    print(empty(level), node.attribute)
    for value, child in node.children:
        print(empty(level + 1), value)
        print_tree(child, level + 2)
# Build the tree from the training file and print it.
metadata, traindata = read_data("tennis.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, 0)
data_loader.py
import csv
def read_data(filename):
    """Read a comma-separated file with a header row.

    Returns:
        ``(metadata, traindata)``: the list of header names and the list of
        data rows (each a list of strings). Blank lines are skipped; an empty
        file yields ``([], [])``.
    """
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        metadata = list(next(datareader, []))
        traindata = [row for row in datareader if row]
    return (metadata, traindata)
tennis.csv
outlook,temperature,humidity,wind,answer
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no
Output
 outlook
  overcast
   b'yes'
  rain
   wind
    b'strong'
     b'no'
    b'weak'
     b'yes'
  sunny
   humidity
    b'high'
     b'no'
    b'normal'
     b'yes'
RESULT:
EX NO: 2
BACK PROPAGATION ALGORITHM
AIM:
To write a program for implementing the Back propagation algorithm and test the same using appropriate data sets.
Source Code:
import numpy as np
# Training data: two input features and one target score per sample.
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)
y = np.array(([92], [86], [89]), dtype=float)
X = X / np.amax(X, axis=0)  # scale each feature column by its maximum
y = y / 100  # scale targets into [0, 1], the range of the sigmoid output


# Sigmoid Function
def sigmoid(x):
    """Logistic activation 1 / (1 + e^-x)."""
    return 1 / (1 + np.exp(-x))


# Derivative of Sigmoid Function
def derivatives_sigmoid(x):
    """Sigmoid derivative, given *x* that is already a sigmoid output."""
    return x * (1 - x)


# Variable initialization
epoch = 7000             # Setting training iterations
lr = 0.1                 # Setting learning rate
inputlayer_neurons = 2   # number of features in data set
hiddenlayer_neurons = 3  # number of hidden layers neurons
output_neurons = 1       # number of neurons at output layer

# weight and bias initialization
# draws a random range of numbers uniformly of dim x*y
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward Propogation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    # how much hidden layer wts contributed to error
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # dotproduct of nextlayererror and currentlayerop
    wout += hlayer_act.T.dot(d_output) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    # NOTE: the listing leaves both bias updates disabled, so bh and bout keep
    # their random initial values. The disabled statements were:
    #   bout += np.sum(d_output, axis=0, keepdims=True) * lr
    #   bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)
output
Input:
[[ 0.66666667 1. ]
[ 0.33333333 0.55555556]
[ 1. 0.66666667]]
Actual Output: [[ 0.92]
[ 0.86]
[ 0.89]]
Predicted Output: [[ 0.89559591]
[ 0.88142069]
[ 0.8928407 ]]
RESULT:
EX NO: 3
MULTILAYER PERCEPTRON
DATE:
AIM:
To write a program for implementing the classification using Multilayer perceptron.
Source Code:
! pip install res-mlp-pytorch
! pip install torch-optimizer
import numpy as np
import pandas as pd
import os
import copy
import time
import torch
import torch.nn as nn
import cv2
import matplotlib.pyplot as plt
import copy
import time
import albumentations as A
import torch_optimizer as optim
from res_mlp_pytorch import ResMLP
from PIL import Image
from albumentations.pytorch import ToTensorV2
from torch.utils.data import Dataset, DataLoader
class FoodDataset(Dataset):
    """Image dataset read from ``<root>/<data_type>/``.

    The label of each image is the integer before the first underscore in its
    file name.

    Args:
        data_type: sub-directory name (the listing uses 'training',
            'validation' and 'evaluation').
        transforms: optional callable invoked as ``transforms(image=img)``
            that returns a mapping with an ``'image'`` entry.
        root: directory containing the split sub-directories.
    """

    def __init__(self, data_type=None, transforms=None,
                 root='../input/food5k/Food-5K/'):
        if data_type is None:
            # The original concatenated None into the path and failed with an
            # opaque TypeError; keep the type, improve the message.
            raise TypeError("data_type must be the name of a dataset sub-directory")
        self.path = os.path.join(root, data_type) + '/'
        # Sorted so that an index always refers to the same file.
        self.images_name = sorted(os.listdir(self.path))
        self.transforms = transforms

    def __len__(self):
        return len(self.images_name)

    def __getitem__(self, idx):
        data = self.images_name[idx]
        label = torch.tensor(int(data.split('_')[0]))

        image = cv2.imread(self.path + data)
        if image is None:
            # cv2.imread returns None instead of raising on an unreadable file.
            raise OSError(f"could not read image {self.path + data!r}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms:
            aug = self.transforms(image=image)
            image = aug['image']
        return (image, label)
def _eval_transforms():
    """Deterministic preprocessing shared by the validation and test splits."""
    return A.Compose([
        A.Resize(384, 384),
        A.CenterCrop(256, 256),
        A.Normalize(),
        ToTensorV2(),
    ])


# Training split: random crop and flip for augmentation.
train_data = FoodDataset('training',
                         A.Compose([
                             A.RandomResizedCrop(256, 256),
                             A.HorizontalFlip(),
                             A.Normalize(),
                             ToTensorV2(),
                         ]))
val_data = FoodDataset('validation', _eval_transforms())
test_data = FoodDataset('evaluation', _eval_transforms())

# Only the training loader shuffles; shuffling the evaluation splits changes
# no metric and only makes the batch order non-reproducible.
dataloaders = {
    'train': DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4),
    'val': DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4),
    'test': DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4),
}

dataset_sizes = {
    'train': len(train_data),
    'val': len(val_data),
    'test': len(test_data),
}
def train_model(model, criterion, optimizer, epochs=1):
    """Train *model* and return it loaded with the best-validation-loss weights.

    Reads the module-level ``dataloaders``, ``dataset_sizes`` and ``device``.
    Each epoch runs a 'train' phase followed by a 'val' phase; the weights of
    the epoch with the lowest validation loss are restored before returning.
    """
    # The original set `since = 0.0`, so the reported duration was the time
    # since the Unix epoch rather than the training time.
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 0.0
    best_acc = 0

    for ep in range(epochs):
        print(f"Epoch {ep}/{epochs-1}")
        print("-"*10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for images, labels in dataloaders[phase]:
                images = images.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                # Gradients are tracked only while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(images)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean; weight it by batch size so the epoch
                # loss is a per-sample average.
                running_loss += loss.item() * images.size(0)
                running_corrects += torch.sum(preds == labels).item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print(f"{phase} Loss:{epoch_loss:.4f} Acc:{epoch_acc:.4f}")

            # The first epoch always becomes the baseline; afterwards keep the
            # weights only when validation loss improves.
            if phase == 'val' and (ep == 0 or epoch_loss < best_loss):
                best_loss = epoch_loss
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60}m {time_elapsed % 60}s')
    print(f'Best val loss: {best_loss:.4f}')
    print(f'Best acc: {best_acc}')

    model.load_state_dict(best_model_wts)
    return model
# Train The Model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = ResMLP(
    image_size=256,
    patch_size=16,
    dim=512,
    depth=12,
    num_classes=2,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Lamb(model.parameters(), lr=0.005, weight_decay=0.2)
best_model = train_model(model, criterion, optimizer, epochs=20)
Output
# ResMLP
Best val loss: 0.2860
Best acc: 0.891
# ResNet-18
Best val loss: 0.0365
Best acc: 0.986
# ResNet-50
Best val loss: 0.0245
Best acc: 0.993
RESULT:
EX NO: 4
NAÏVE BAYESIAN CLASSIFIER
DATE:
AIM
To write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.
Source Code:
import csv
import math
import random
def loadCsv(filename):
    """Load a headerless numeric CSV file as a list of rows of floats.

    Blank lines are skipped. Raises ValueError when a field is not numeric.
    """
    # `with` closes the file; the original left the handle open.
    with open(filename, "r", newline="") as csvfile:
        # converting strings into numbers for processing
        return [[float(x) for x in row] for row in csv.reader(csvfile) if row]
def splitDataset(dataset, splitRatio):
    """Randomly split *dataset* into ``[trainSet, testSet]``.

    ``int(len(dataset) * splitRatio)`` rows (0.67 in this program) are drawn
    without replacement for training; the rest form the test set. *dataset*
    itself is not modified.
    """
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    remaining = list(dataset)
    # `and remaining` stops cleanly if splitRatio asks for more rows than exist.
    while len(trainSet) < trainSize and remaining:
        # generate indices for the dataset list randomly to pick ele for training data
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]
def separateByClass(dataset):
    """Group rows by class label (the last element of each row).

    Returns a dict mapping each class value to the list of rows in that class.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated
def mean(numbers):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(numbers) / float(len(numbers))
def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of *numbers*.

    Returns 0.0 for fewer than two values, where the sample deviation is
    undefined; the original divided by zero in that case.
    """
    n = len(numbers)
    if n < 2:
        return 0.0
    avg = sum(numbers) / float(n)
    variance = sum(pow(x - avg, 2) for x in numbers) / float(n - 1)
    return math.sqrt(variance)
def summarize(dataset):
    """Return ``[(mean, stdev), ...]`` for every column except the last.

    The last column is the class label and is excluded from the summaries.
    """
    columns = list(zip(*dataset))
    return [(mean(attribute), stdev(attribute)) for attribute in columns[:-1]]
def summarizeByClass(dataset):
    """Return a dict mapping each class value to its per-attribute summaries.

    summaries is a dic of tuples(mean,std) for each class value.
    """
    separated = separateByClass(dataset)
    return {
        classValue: summarize(instances)
        for classValue, instances in separated.items()
    }
def calculateProbability(x, mean, stdev):
    """Return the Gaussian density of *x* for the given mean and deviation.

    A zero deviation (an attribute constant within the class) is treated as a
    point mass: 1.0 when *x* equals the mean, otherwise 0.0. The original
    divided by zero in that case.
    """
    if stdev == 0:
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
def calculateClassProbabilities(summaries, inputVector):
    """Return, per class, the product of the attribute densities of *inputVector*.

    Args:
        summaries: dict mapping class value to a list of ``(mean, stdev)``
            pairs, one per attribute.
        inputVector: row whose leading elements are the attribute values.
    """
    probabilities = {}
    # class and attribute information as mean and sd
    for classValue, classSummaries in summaries.items():
        probability = 1
        # take mean and sd of every attribute for each class separately
        for i, (mu, sigma) in enumerate(classSummaries):
            x = inputVector[i]
            probability *= calculateProbability(x, mu, sigma)  # use normal dist
        probabilities[classValue] = probability
    return probabilities
def predict(summaries, inputVector):
    """Return the class with the highest probability for *inputVector*.

    Returns None when *summaries* is empty. Ties go to the class encountered
    first.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    # assigns that class which has the highest prob
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
def getPredictions(summaries, testSet):
    """Return the predicted class for every row of *testSet*, in order."""
    return [predict(summaries, row) for row in testSet]
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    Returns 0.0 for an empty test set; the original divided by zero.
    """
    if not testSet:
        return 0.0
    correct = sum(
        1 for i, row in enumerate(testSet) if row[-1] == predictions[i]
    )
    return (correct / float(len(testSet))) * 100.0
def main():
    """Train the classifier on a random split of '5data.csv' and print its accuracy."""
    filename = '5data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))


if __name__ == "__main__":
    main()
Output
confusion matrix is as follows [[17 0 0]
[ 0 17 0]
[ 0 0 11]]
Accuracy metrics
precision recall f1-score support
0 1.00 1.00 1.00
17
1 1.00 1.00 1.00
17
2 1.00 1.00 1.00
11
avg / total
1.00
1.00
1.00 45
RESULT:
EX NO: 5
NAÏVE BAYESIAN CLASSIFIER USING BUILT-IN FUNCTIONS
DATE:
AIM:
Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set.
Source Code:
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the labelled messages; the CSV has no header row.
msg = pd.read_csv('naivetext1.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
print(X)
print(y)

# splitting the dataset into train and test data
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)

# output of count vectoriser is a sparse matrix
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# replaces it. list() keeps the printed form of the original.
feature_names = list(count_vect.get_feature_names_out())
print(feature_names)
df = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)
print(df)  # tabular representation
print(xtrain_dtm)  # sparse matrix representation

# Training Naive Bayes (NB) classifier on training data.
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# printing accuracy metrics
print('Accuracy metrics')
print('Accuracy of the classifer is', metrics.accuracy_score(ytest, predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precison ')
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))

# Disabled example from the listing: classify unseen sentences. The original
# looked the prediction up with msg.labelnum[category], which indexes a data
# row by the predicted label; it is mapped back to a label name here instead.
# docs_new = ['I like this place', 'My boss is not my saviour']
# X_new_counts = count_vect.transform(docs_new)
# predictednew = clf.predict(X_new_counts)
# for doc, category in zip(docs_new, predictednew):
#     print('%s->%s' % (doc, 'pos' if category == 1 else 'neg'))
naivetext1.csv
I love this sandwich,pos
This is an amazing place,pos
I feel very good about these beers,pos
This is my best work,pos
What an awesome view,pos
I do not like this restaurant,neg
I am tired of this stuff,neg
I can't deal with this,neg
He is my sworn enemy,neg
My boss is horrible,neg
This is an awesome place,pos
I do not like the taste of this juice,neg
I love to dance,pos
I am sick and tired of this place,neg
What a great holiday,pos
That is a bad locality to stay,neg
We will have good fun tomorrow,pos
I went to my enemy's house today,neg
OUTPUT
['about', 'am', 'amazing', 'an', 'and', 'awesome', 'beers', 'best', 'boss', 'can', 'deal',
'do', 'enemy', 'feel', 'fun', 'good', 'have', 'horrible', 'house', 'is', 'like', 'love', 'my',
'not', 'of', 'place', 'restaurant', 'sandwich', 'sick', 'stuff', 'these', 'this', 'tired', 'to',
'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with', 'work'] about am amazing an and awesome beers best boss can ... today \
0
1 0 0 0 0 0 1
0 0 0 ... 0
1
0 0 0 0 0 0 0
1 0 0 ... 0
2
0 0 1 1 0 0
0 0 0 0 ... 0
3
0 0 0 0 0 0 0
0 0 0 ... 1
4
0 0 0 0 0 0 0
0 0 0 ... 0
5
0 1 0 0 1
0 0 0 0 0 ... 0
6
0 0 0 0 0 0 0
0 0 1 ... 0
7
0 0 0 0 0 0 0
0 0 0 ... 0
8
0 1 0 0 0 0 0
0 0 0 ... 0
9
0 0 0 1 0 1 0
0 0 0 ... 0
10 0 0
0 0 0 0 0 0 0 0 ... 0
11 0 0
0 0 0 0 0 0 1 0 ... 0
12 0 0
0 1 0 1 0 0 0 0 ... 0
tomorrow very view we went what will with work 0 0 1 0 0 0 0 0 0 0
1
0
0
0
0
0 0
0
0
1
2
0
0
0
0
0 0
0
0
0
3
0
0
0
0
1 0
0
0
0
4
0
0
0
0
0 0
0
0
0
5
0
0
0
0
0 0
0
0
0
6
0
0
0
0
0 0
0
1
0
7
1
0
0
1
0 0
1
0
0
8
0
0
0
0
0 0
0
0
0
RESULT:
EX NO: 6
BAYESIAN NETWORK
DATE:
AIM:
To write a program to construct a Bayesian network considering medical data. Use this model to demonstrate the diagnosis of heart patients using standard Heart Disease Data Set. You can use Java/Python ML library classes/API.
Source Code:
import bayespy as bp
import numpy as np
import csv
from colorama import init
from colorama import Fore, Back, Style
init()
# Integer codes for every categorical parameter. Each label is numbered by its
# position in the list, starting at 0.

# Age
ageEnum = {
    label: code
    for code, label in enumerate(
        ['SuperSeniorCitizen', 'SeniorCitizen', 'MiddleAged', 'Youth', 'Teen']
    )
}
# Gender
genderEnum = {label: code for code, label in enumerate(['Male', 'Female'])}
# FamilyHistory
familyHistoryEnum = {label: code for code, label in enumerate(['Yes', 'No'])}
# Diet (Calorie Intake)
dietEnum = {label: code for code, label in enumerate(['High', 'Medium', 'Low'])}
# LifeStyle
lifeStyleEnum = {
    label: code
    for code, label in enumerate(['Athlete', 'Active', 'Moderate', 'Sedetary'])
}
# Cholesterol
cholesterolEnum = {
    label: code for code, label in enumerate(['High', 'BorderLine', 'Normal'])
}
# HeartDisease
heartDiseaseEnum = {label: code for code, label in enumerate(['Yes', 'No'])}
# heart_disease_data.csv
# Each row holds seven labels in the order: age, gender, family history, diet,
# lifestyle, cholesterol, heart disease.
with open('heart_disease_data.csv', newline='') as csvfile:
    lines = csv.reader(csvfile)
    dataset = [row for row in lines if row]  # ignore blank lines

# Encode every label with its integer code; an unknown label raises KeyError.
data = []
for x in dataset:
    data.append([
        ageEnum[x[0]],
        genderEnum[x[1]],
        familyHistoryEnum[x[2]],
        dietEnum[x[3]],
        lifeStyleEnum[x[4]],
        cholesterolEnum[x[5]],
        heartDiseaseEnum[x[6]],
    ])

# Training data for machine learning
data = np.array(data)
N = len(data)
# Input data column assignment
def _observed_categorical(n_states, column):
    """Create a Dirichlet(ones) prior and a categorical node observed on one column.

    The categorical node has one plate per training row and is observed on
    ``data[:, column]``. Returns ``(prior, node)``.
    """
    prior = bp.nodes.Dirichlet(1.0 * np.ones(n_states))
    node = bp.nodes.Categorical(prior, plates=(N,))
    node.observe(data[:, column])
    return prior, node


p_age, age = _observed_categorical(5, 0)
p_gender, gender = _observed_categorical(2, 1)
p_familyhistory, familyhistory = _observed_categorical(2, 2)
p_diet, diet = _observed_categorical(3, 3)
p_lifestyle, lifestyle = _observed_categorical(4, 4)
p_cholesterol, cholesterol = _observed_categorical(3, 5)

# Prepare nodes and establish edges
# np.ones(2) -> HeartDisease has 2 options Yes/No
# plates(5, 2, 2, 3, 4, 3) -> corresponds to options present for domain values
p_heartdisease = bp.nodes.Dirichlet(np.ones(2), plates=(5, 2, 2, 3, 4, 3))
heartdisease = bp.nodes.MultiMixture(
    [age, gender, familyhistory, diet, lifestyle, cholesterol],
    bp.nodes.Categorical,
    p_heartdisease,
)
heartdisease.observe(data[:, 6])
p_heartdisease.update()
# Sample Test with hardcoded values (disabled in the listing)
# print("Sample Probability")
# print("Probability(HeartDisease|Age=SuperSeniorCitizen, Gender=Female, "
#       "FamilyHistory=Yes, DietIntake=Medium, LifeStyle=Sedetary, Cholesterol=High)")
# print(bp.nodes.MultiMixture(
#     [ageEnum['SuperSeniorCitizen'], genderEnum['Female'],
#      familyHistoryEnum['Yes'], dietEnum['Medium'], lifeStyleEnum['Sedetary'],
#      cholesterolEnum['High']],
#     bp.nodes.Categorical, p_heartdisease,
# ).get_moments()[0][heartDiseaseEnum['Yes']])

# Interactive Test
# Prompts for the six integer codes, prints the 'Yes' entry of the node's first
# moment, and repeats until the user enters a non-zero value.
m = 0
while m == 0:
    print("\n")
    res = bp.nodes.MultiMixture(
        [
            int(input('Enter Age: ' + str(ageEnum))),
            int(input('Enter Gender: ' + str(genderEnum))),
            int(input('Enter FamilyHistory: ' + str(familyHistoryEnum))),
            int(input('Enter dietEnum: ' + str(dietEnum))),
            int(input('Enter LifeStyle: ' + str(lifeStyleEnum))),
            int(input('Enter Cholesterol: ' + str(cholesterolEnum))),
        ],
        bp.nodes.Categorical,
        p_heartdisease,
    ).get_moments()[0][heartDiseaseEnum['Yes']]
    print("Probability(HeartDisease) = " + str(res))
    # print(Style.RESET_ALL)
    m = int(input("Enter for Continue:0, Exit :1 "))
OUTPUT:
Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}0
Enter Gender: {'Male': 0, 'Female': 1}0
Enter FamilyHistory: {'Yes': 0, 'No': 1}0
Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}0
Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}0
Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}0
Probability(HeartDisease) = 0.5
Enter for Continue:0, Exit :1 0
Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}4
Enter Gender: {'Male': 0, 'Female': 1}0
Enter FamilyHistory: {'Yes': 0, 'No': 1}0
Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}1
Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}3
Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}2
Probability(HeartDisease) = 0.13784165696493575
Enter for Continue:0, Exit :1 0
Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}3
Enter Gender: {'Male': 0, 'Female': 1}1
Enter FamilyHistory: {'Yes': 0, 'No': 1}0
Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}1
Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}0
Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}1
Probability(HeartDisease) = 0.2689414213699951
Enter for Continue:0, Exit :1
EX NO: 7
EM ALGORITHM
DATE:
AIM:
To apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering using k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering. You can add Java/Python ML library classes/API in the program.
Source Code:
import numpy as np
import matplotlib.pyplot as plt
# make_blobs is imported from sklearn.datasets; the samples_generator module
# used by the listing no longer exists in current scikit-learn.
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture

X, y_true = make_blobs(n_samples=100, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]  # flip axes for better plotting

gmm = GaussianMixture(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')

probs = gmm.predict_proba(X)
print(probs[:5].round(3))

size = 50 * probs.max(1) ** 2  # square emphasizes differences
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size)
from matplotlib.patches import Ellipse
def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw an ellipse with a given position and covariance.

    Three concentric ellipses (1, 2 and 3 times the base axes) are added to
    *ax*, or to the current axes when *ax* is None. Extra keyword arguments
    are passed to ``Ellipse``.
    """
    ax = ax if ax is not None else plt.gca()

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        # Multiplying by ones(2) lets a scalar variance unpack as well as a
        # pair of per-axis variances.
        width, height = 2 * np.sqrt(covariance) * np.ones(2)

    # Draw the Ellipse
    for nsig in range(1, 4):
        # angle is passed by keyword; recent Matplotlib no longer accepts it
        # positionally.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit *gmm* to *X*, scatter the points and overlay one ellipse set per component.

    Points are coloured by predicted component when *label* is true. Ellipse
    opacity is proportional to the component weight.
    """
    ax = ax if ax is not None else plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')

    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # Forward ax so the ellipses land on the same axes as the scatter.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)
# Plot with the default covariance type, then with it set to 'full' explicitly.
gmm = GaussianMixture(n_components=4, random_state=42)
plot_gmm(gmm, X)
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)
plot_gmm(gmm, X)
Output
[[1 ,0, 0, 0]
[0 ,0, 1, 0]
[1 ,0, 0, 0]
[1 ,0, 0, 0]
[1 ,0, 0, 0]]
K-means
from sklearn.cluster import KMeans
# from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv("kmeansdata.csv")
df1 = pd.DataFrame(data)
print(df1)

f1 = df1['Distance_Feature'].values
f2 = df1['Speeding_Feature'].values
# A plain 2-D array of (distance, speeding) pairs replaces the np.matrix of
# the listing, which current scikit-learn estimators reject.
X = np.column_stack((f1, f2))

# Plot the raw data.
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.ylabel('speeding_feature')
plt.xlabel('Distance_Feature')
plt.scatter(f1, f2)
plt.show()

# create new plot and data
plt.plot()
colors = ['b', 'g', 'r']
markers = ['o', 'v', 's']

# KMeans algorithm
# K = 3
kmeans_model = KMeans(n_clusters=3).fit(X)

plt.plot()
# One marker per point, coloured and shaped by its cluster label.
for i, l in enumerate(kmeans_model.labels_):
    plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l], ls='None')
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.show()
Driver_ID,Distance_Feature,Speeding_Feature
3423311935,71.24,28
3423313212,52.53,25
3423313724,64.54,27
3423311373,55.69,22
3423310999,54.58,25
3423313857,41.91,10
3423312432,58.64,20
3423311434,52.02,8
3423311328,31.25,34
3423312488,44.31,19
3423311254,49.35,40
3423312943,58.07,45
3423312536,44.22,22
3423311542,55.73,19
3423312176,46.63,43
3423314176,52.97,32
3423314202,46.25,35
3423311346,51.55,27
3423310666,57.05,26
3423313527,58.45,30
3423312182,43.42,23
3423313590,55.68,37
3423312268,55.15,18
RESULT:
EX NO: 8
PRINCIPAL COMPONENT ANALYSIS FOR DIMENSIONALITY REDUCTION
DATE:
AIM:
To write a program to implement Principal Component Analysis for Dimensionality Reduction.
Source Code:
# Illustrative fragments showing how PCA is used on its own and inside a
# Pipeline.
# NOTE(review): PCA, Pipeline and LogisticRegression are only imported further
# down in this listing, MinMaxScaler is not imported anywhere in view, and
# `data` is never defined -- these lines will not run as a script in this
# position. Confirm whether they are meant as excerpts only.
# define transform
pca = PCA()
# prepare transform on dataset
pca.fit(data)
# apply transform to dataset
transformed = pca.transform(data)
# define the pipeline (PCA followed by logistic regression)
steps = [('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
# define the pipeline (same, with min-max scaling before PCA);
# this rebinds `steps` and `model` from the previous fragment
steps = [('norm', MinMaxScaler()), ('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
# test classification dataset
from sklearn.datasets import make_classification
# define dataset: 1000 samples, 20 features (15 informative, 5 redundant)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
# summarize the dataset
print(X.shape, y.shape)
(1000, 20) (1000,)
# evaluate pca with logistic regression algorithm for classification
# (The listing printed this program twice verbatim; one copy is kept so the
# cross-validation is not run twice.)
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                           n_redundant=5, random_state=7)
# define the pipeline: project onto 10 principal components, then classify
steps = [('pca', PCA(n_components=10)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
# evaluate model with 10-fold stratified cross-validation repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1,
                           error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Accuracy: 0.816 (0.034)
# compare pca number of components with logistic regression algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
# get the dataset
def get_dataset():
    """Return the synthetic classification dataset ``(X, y)`` used in this exercise."""
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                               n_redundant=5, random_state=7)
    return X, y
# get a list of models to evaluate
def get_models(max_components=20):
    """Return PCA + logistic-regression pipelines keyed by component count.

    One pipeline is built for each number of components from 1 to
    *max_components* inclusive; keys are the counts as strings. The default of
    20 matches the feature count of the dataset used here.
    """
    models = dict()
    for i in range(1, max_components + 1):
        steps = [('pca', PCA(n_components=i)), ('m', LogisticRegression())]
        models[str(i)] = Pipeline(steps=steps)
    return models
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the accuracy scores of *model* under repeated stratified 10-fold CV.

    The split is repeated 3 times with a fixed random state, giving 30 scores.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1,
                             error_score='raise')
    return scores
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
# Tick labels are set through xticks so the call does not depend on the name
# of boxplot's label keyword, which differs between Matplotlib versions.
pyplot.boxplot(results, showmeans=True)
pyplot.xticks(range(1, len(names) + 1), names, rotation=45)
pyplot.show()
Output:
>1 0.542 (0.048)
>2 0.713 (0.048)
>3 0.720 (0.053)
>4 0.723 (0.051)
>5 0.725 (0.052)
>6 0.730 (0.046)
>7 0.805 (0.036)
>8 0.800 (0.037)
>9 0.814 (0.036)
>10 0.816 (0.034)
>11 0.819 (0.035)
>12 0.819 (0.038)
>13 0.819 (0.035)
>14 0.853 (0.029)
>15 0.865 (0.027)
>16 0.865 (0.027)
>17 0.865 (0.027)
>18 0.865 (0.027)
>19 0.865 (0.027)
>20 0.865 (0.027)
RESULT:
EX NO: 9
K-NEAREST NEIGHBOUR ALGORITHM
DATE:
AIM:
To write a program to implement k-Nearest Neighbour algorithm to classify the iris data set. Print both correct and wrong predictions. Java/Python ML library classes can be used for this problem.
Source Code:
import csv
import math
import operator
import random
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Load a CSV file and randomly divide its rows into training and test sets.

    Every field except the last (the class label) is converted to float. Each
    row goes to *trainingSet* with probability *split*, otherwise to *testSet*.

    Args:
        filename: path of the CSV file (no header row).
        split: probability that a row is placed in the training set.
        trainingSet, testSet: optional lists that are appended to in place.

    Returns:
        ``(trainingSet, testSet)``, the lists that were filled.
    """
    # Fresh lists per call: the original mutable defaults were shared across calls.
    trainingSet = [] if trainingSet is None else trainingSet
    testSet = [] if testSet is None else testSet

    # Text mode is required by csv in Python 3 (the listing used 'rb').
    with open(filename, 'r', newline='') as csvfile:
        # Skipping blank rows replaces the original range(len(dataset) - 1),
        # which dropped the last row whether or not it was blank.
        dataset = [row for row in csv.reader(csvfile) if row]

    for row in dataset:
        for y in range(len(row) - 1):
            row[y] = float(row[y])
        if random.random() < split:
            trainingSet.append(row)
        else:
            testSet.append(row)
    return trainingSet, testSet
def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first *length* elements of two instances."""
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)
def getNeighbors(trainingSet, testInstance, k):
    """Return the *k* training rows closest to *testInstance*.

    The last element of each row is treated as the label and excluded from the
    distance. Fewer than *k* rows are returned when the training set is
    smaller than *k*; the original raised IndexError.
    """
    length = len(testInstance) - 1
    distances = [
        (row, euclideanDistance(testInstance, row, length))
        for row in trainingSet
    ]
    distances.sort(key=operator.itemgetter(1))
    return [row for row, _ in distances[:k]]
def getResponse(neighbors):
    """Return the majority class label (last element) among *neighbors*.

    Ties go to the label that appeared first.
    """
    classVotes = {}
    for neighbor in neighbors:
        response = neighbor[-1]
        classVotes[response] = classVotes.get(response, 0) + 1
    # Sort by vote count. The key argument had drifted into the next function
    # in the listing, which made the sort order by label instead of by votes.
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sortedVotes[0][0]


def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label equals its prediction.

    Returns 0.0 for an empty test set.
    """
    if not testSet:
        return 0.0
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0
def main():
    """Classify a random test split of 'knndat.data' with 3-NN and print every prediction."""
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('knndat.data', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))
    # generate predictions
    predictions = []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')


if __name__ == "__main__":
    main()
OUTPUT
Confusion matrix is as follows
[[11 0 0]
[0 9 1]
[0 1 8]]
Accuracy metrics 0 1.00 1.00 1.00 11
1 0.90 0.90 0.90 10
2 0.89 0.89 0.89 9
Avg/Total 0.93 0.93 0.93 30
RESULT:
EX NO: 10
LOCALLY WEIGHTED REGRESSION ALGORITHM
DATE:
AIM:
To implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select appropriate data set for your experiment and draw graphs.
Source Code:
import operator
from os import listdir

import matplotlib
import matplotlib.pyplot as plt
import numpy as np1          # NumPy itself is bound to `np1` in this program
import numpy.linalg as np    # `np` here is numpy.linalg, not NumPy
import pandas as pd
from numpy import *
# pearsonr is imported from scipy.stats; the scipy.stats.stats path used by the
# listing is deprecated.
from scipy.stats import pearsonr
def kernel(point, xmat, k):
    """Return the diagonal Gaussian weight matrix of *point* against the rows of *xmat*.

    Entry (j, j) is ``exp(-||point - xmat[j]||^2 / (2 k^2))``.

    Args:
        point: query row (1 x n).
        xmat: m x n design matrix.
        k: bandwidth; larger values give a smoother fit.

    Returns:
        An m x m ``numpy.matrix``.
    """
    m, n = np1.shape(xmat)
    # asmatrix is used because np.mat no longer exists in NumPy 2.
    weights = np1.asmatrix(np1.eye(m))
    for j in range(m):
        # Use the xmat argument; the original read the global X here.
        diff = np1.asarray(point - xmat[j]).ravel()
        weights[j, j] = np1.exp(diff.dot(diff) / (-2.0 * k ** 2))
    return weights
def localWeight(point, xmat, ymat, k):
    """Return the locally weighted least-squares coefficients at *point*.

    Computes ``(X^T W X)^-1 X^T W y`` with W from ``kernel``. *xmat* and
    *ymat* are expected to be ``numpy.matrix`` objects, since ``*`` is used as
    a matrix product and ``.I`` as the inverse; *ymat* is a 1 x m row.
    """
    wei = kernel(point, xmat, k)
    # Use the xmat argument throughout; the original read the global X.
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W
def localWeightRegression(xmat, ymat, k):
    """Return the locally weighted regression prediction for every row of *xmat*.

    A separate weighted least-squares fit is solved for each row.

    Returns:
        1-D array of m predictions.
    """
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        # The product is a 1 x 1 matrix; take its single element explicitly.
        ypred[i] = (xmat[i] * localWeight(xmat[i], xmat, ymat, k))[0, 0]
    return ypred
# load data points
data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

# preparing and add 1 in bill
# asmatrix replaces np.mat, which no longer exists in NumPy 2.
mbill = np1.asmatrix(bill)
mtip = np1.asmatrix(tip)
m = np1.shape(mbill)[1]
one = np1.asmatrix(np1.ones(m))
X = np1.hstack((one.T, mbill.T))  # design matrix: a column of ones, then the bill

# set k here
ypred = localWeightRegression(X, mtip, 2)
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]
# NOTE(review): the listing stops here; the plotting code that would use
# xsort and ypred is not present in this view.
Output