Wednesday, November 10, 2021

IV CSE 3 ML LAB 2021

EX NO: 1

DECISION TREE BASED ID3 ALGORITHM

DATE:

AIM:

To write a program to demonstrate the working of the decision tree based ID3 algorithm. Use an appropriate data set for building the decision tree and apply this knowledge to classify a new sample.

Source Code:

# Imports for the ID3 program: one statement per line (the two `import`
# statements were fused onto a single line, which is a SyntaxError).
import numpy as np
import math

from data_loader import read_data

class Node:
    """A node of the decision tree.

    Internal nodes carry the name of the attribute they split on plus a list
    of ``(attribute_value, child_node)`` pairs; leaf nodes are created with an
    empty attribute name and carry the predicted class label in ``answer``.
    """

    def __init__(self, attribute):
        # Name of the attribute this node splits on ("" for a leaf).
        self.attribute = attribute
        # (value, Node) pairs appended by the tree builder.
        self.children = []
        # Class label of a leaf; stays "" for internal nodes.
        self.answer = ""

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    """Partition ``data`` by the distinct values found in column ``col``.

    Args:
        data: 2-D NumPy array of attribute values, one row per example.
        col: index of the column to split on.
        delete: when true, column ``col`` is removed from every sub-table.

    Returns:
        A tuple ``(items, tables)``. ``items`` is the sorted array of distinct
        values in the column; ``tables`` maps each of those values to the rows
        that carry it, in their original order, stored as a ``"|S32"``
        byte-string array (so values longer than 32 bytes are truncated).
    """
    items = np.unique(data[:, col])
    tables = {}
    for item in items:
        # A boolean mask selects the matching rows in one step; this replaces
        # the count-then-copy loops and the int() call on a 1-element array.
        rows = data[data[:, col] == item].astype("|S32")
        if delete:
            rows = np.delete(rows, col, 1)
        tables[item] = rows
    return items, tables

def entropy(S):
    """Return the Shannon entropy (base 2) of the labels in ``S``.

    Args:
        S: 1-D NumPy array of class labels.

    Returns:
        ``0`` when every label is identical, otherwise the entropy in bits as
        a float.
    """
    items, counts = np.unique(S, return_counts=True)
    if items.size == 1:
        return 0
    # Relative frequency of each distinct label.
    probabilities = counts / (S.size * 1.0)
    return float(-np.sum(probabilities * np.log2(probabilities)))

def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting ``data`` on column ``col``.

    The class label is read from the last column of ``data``. The gain ratio
    is the information gain of the split divided by its intrinsic value (the
    entropy of the partition sizes).

    Returns:
        The gain ratio, or ``0.0`` when the column holds a single distinct
        value. In that case the intrinsic value is zero and the division
        would otherwise produce NaN, which ``np.argmax`` treats as a maximum.
    """
    items, tables = subtables(data, col, delete=False)
    total_size = data.shape[0]

    information_gain = entropy(data[:, -1])
    intrinsic_value = 0.0
    for item in items:
        subset = tables[item]
        ratio = subset.shape[0] / (total_size * 1.0)
        information_gain -= ratio * entropy(subset[:, -1])
        intrinsic_value -= ratio * math.log(ratio, 2)

    if intrinsic_value == 0:
        return 0.0
    return information_gain / intrinsic_value

def create_node(data, metadata):
    """Recursively build the decision (sub)tree for ``data``.

    Args:
        data: 2-D NumPy array; the last column is the class label.
        metadata: sequence of column names aligned with the columns of
            ``data``.

    Returns:
        The root ``Node`` of the tree. Leaves have ``attribute == ""`` and the
        class label in ``answer``.
    """
    labels = np.unique(data[:, -1])
    if labels.shape[0] == 1:
        # Every example agrees on the label: this is a leaf.
        leaf = Node("")
        leaf.answer = labels[0]
        return leaf

    n_attributes = data.shape[1] - 1
    if n_attributes == 0:
        # No attribute left to split on although the labels still disagree:
        # fall back to the majority label instead of calling argmax on an
        # empty array.
        values, counts = np.unique(data[:, -1], return_counts=True)
        leaf = Node("")
        leaf.answer = values[np.argmax(counts)]
        return leaf

    gains = np.zeros((n_attributes, 1))
    for col in range(n_attributes):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)

    node = Node(metadata[split])
    # The chosen column is dropped from both the names and the sub-tables so
    # that indices stay aligned in the recursive calls.
    metadata = np.delete(metadata, split, 0)
    items, tables = subtables(data, split, delete=True)
    for item in items:
        child = create_node(tables[item], metadata)
        node.children.append((item, child))
    return node


def empty(size):
    """Return a string of ``size`` spaces, used to indent the printed tree."""
    return " " * size

def print_tree(node, level):
    """Print the tree rooted at ``node``, indented according to ``level``.

    A leaf prints its answer. An internal node prints its attribute name,
    then each branch value one level deeper, followed by that branch's
    subtree two levels deeper.
    """
    if node.answer != "":
        print(empty(level), node.answer)
        return

    print(empty(level), node.attribute)
    for value, child in node.children:
        print(empty(level + 1), value)
        print_tree(child, level + 2)

# Load the training examples, build the decision tree and print it.
metadata, traindata = read_data("tennis.csv")
data = np.array(traindata)

node = create_node(data, metadata)
print_tree(node, 0)

Data_loader.py

import csv

def read_data(filename):
    """Read a comma-separated file whose first row is a header.

    Args:
        filename: path of the CSV file.

    Returns:
        A tuple ``(metadata, traindata)``: the list of column names from the
        header row and the list of remaining rows, each a list of strings.
        Both are empty for an empty file.
    """
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        # next() with a default avoids StopIteration on an empty file.
        metadata = list(next(datareader, []))
        traindata = list(datareader)
    return (metadata, traindata)

Tennis.csv

outlook,temperature,humidity,wind,answer
sunny,hot,high,weak,no
sunny,hot,high,strong,no
overcast,hot,high,weak,yes
rain,mild,high,weak,yes
rain,cool,normal,weak,yes
rain,cool,normal,strong,no
overcast,cool,normal,strong,yes
sunny,mild,high,weak,no
sunny,cool,normal,weak,yes
rain,mild,normal,weak,yes
sunny,mild,normal,strong,yes
overcast,mild,high,strong,yes
overcast,hot,normal,weak,yes
rain,mild,high,strong,no

Output

outlook
 overcast
  b'yes'
 rain
  wind
   b'strong'
    b'no'
   b'weak'
    b'yes'
 sunny
  humidity
   b'high'
    b'no'
   b'normal'
    b'yes'

RESULT:



EX NO: 2

BACK PROPAGATION ALGORITHM


AIM:

To write a program for implementing the Back propagation algorithm and test the same using appropriate data sets.

Source Code:

import numpy as np

# Training inputs: three samples with two features each.
X = np.array(([2, 9], [1, 5], [3, 6]), dtype=float)

# One target value per sample.
y = np.array(([92], [86], [89]), dtype=float)

# Scale every feature column by its own maximum.
X = X / np.amax(X, axis=0)
# Scale the targets into the (0, 1) range the sigmoid output can reach.
# This statement had been swallowed by the trailing comment on the line above.
y = y / 100

# Sigmoid Function
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of ``x``, element-wise."""
    return 1 / (1 + np.exp(-x))

# Derivative of Sigmoid Function
def derivatives_sigmoid(x):
    """Return the sigmoid's derivative, given the sigmoid *output* ``x``.

    The training loop calls this with activations that have already passed
    through ``sigmoid``, hence ``x * (1 - x)`` with no further exponential.
    """
    return x * (1 - x)

# Variable initialization
epoch = 7000  # number of training iterations
lr = 0.1  # learning rate
inputlayer_neurons = 2  # number of features in data set
hiddenlayer_neurons = 3  # number of hidden layer neurons
output_neurons = 1  # number of neurons at output layer

# Weight and bias initialization: uniform random values of the given shape.
wh = np.random.uniform(size=(inputlayer_neurons, hiddenlayer_neurons))
bh = np.random.uniform(size=(1, hiddenlayer_neurons))
wout = np.random.uniform(size=(hiddenlayer_neurons, output_neurons))
bout = np.random.uniform(size=(1, output_neurons))

for i in range(epoch):
    # Forward propagation
    hinp1 = np.dot(X, wh)
    hinp = hinp1 + bh
    hlayer_act = sigmoid(hinp)
    outinp1 = np.dot(hlayer_act, wout)
    outinp = outinp1 + bout
    output = sigmoid(outinp)

    # Backpropagation
    EO = y - output
    outgrad = derivatives_sigmoid(output)
    d_output = EO * outgrad
    EH = d_output.dot(wout.T)
    # How much the hidden layer weights contributed to the error.
    hiddengrad = derivatives_sigmoid(hlayer_act)
    d_hiddenlayer = EH * hiddengrad

    # Dot product of the next layer's error and the current layer's output.
    wout += hlayer_act.T.dot(d_output) * lr
    # The bias updates were commented out in the listing, leaving bh and bout
    # at their random initial values; they are enabled here.
    bout += np.sum(d_output, axis=0, keepdims=True) * lr
    wh += X.T.dot(d_hiddenlayer) * lr
    bh += np.sum(d_hiddenlayer, axis=0, keepdims=True) * lr

print("Input: \n" + str(X))
print("Actual Output: \n" + str(y))
print("Predicted Output: \n", output)

output

Input:

[[ 0.66666667 1. ]

[ 0.33333333 0.55555556]

[ 1. 0.66666667]]

Actual Output: [[ 0.92]

[ 0.86]

[ 0.89]]

Predicted Output: [[ 0.89559591]

[ 0.88142069]

[ 0.8928407 ]]

RESULT:



EX NO: 3

MULTILAYER PERCEPTRON

DATE:

AIM:

To write a program for implementing the classification using Multilayer perceptron.

Source Code:

! pip install res-mlp-pytorch

! pip install torch-optimizer

import numpy as np

import pandas as pd

import os

import copy

import time

import torch

import torch.nn as nn

import cv2

import matplotlib.pyplot as plt

import copy

import time

import albumentations as A

import torch_optimizer as optim

from res_mlp_pytorch import ResMLP

from PIL import Image

from albumentations.pytorch import ToTensorV2

from torch.utils.data import Dataset, DataLoader

class FoodDataset(Dataset):
    """Image dataset backed by one sub-directory of ``../input/food5k/Food-5K/``.

    The class label of an image is parsed from its file name: the text before
    the first underscore is converted to ``int``.
    """

    def __init__(self, data_type=None, transforms=None):
        """
        Args:
            data_type: name of the sub-directory to read (the script uses
                'training', 'validation' and 'evaluation').
            transforms: optional callable invoked as ``transforms(image=img)``
                that returns a mapping with the result under ``'image'``.

        Raises:
            ValueError: if ``data_type`` is not given. The ``None`` default
                used to fail later with an opaque str + None TypeError.
        """
        if data_type is None:
            raise ValueError("data_type must name a Food-5K sub-directory")
        self.path = '../input/food5k/Food-5K/' + data_type + '/'
        self.images_name = os.listdir(self.path)
        self.transforms = transforms

    def __len__(self):
        return len(self.images_name)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the file at position ``idx``."""
        file_name = self.images_name[idx]
        label = torch.tensor(int(file_name.split('_')[0]))

        image = cv2.imread(self.path + file_name)
        if image is None:
            # cv2.imread reports an unreadable file by returning None, not by
            # raising, so fail here with a clear message.
            raise FileNotFoundError("could not read image: " + self.path + file_name)
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transforms:
            image = self.transforms(image=image)['image']
        return (image, label)

# Training split: random resized crop and horizontal flip, then
# normalisation and conversion to a tensor.
train_data = FoodDataset(
    data_type='training',
    transforms=A.Compose([
        A.RandomResizedCrop(256, 256),
        A.HorizontalFlip(),
        A.Normalize(),
        ToTensorV2(),
    ]),
)

# Validation split: fixed resize and centre crop.
val_data = FoodDataset(
    data_type='validation',
    transforms=A.Compose([
        A.Resize(384, 384),
        A.CenterCrop(256, 256),
        A.Normalize(),
        ToTensorV2(),
    ]),
)

# Evaluation split: same pipeline as validation.
test_data = FoodDataset(
    data_type='evaluation',
    transforms=A.Compose([
        A.Resize(384, 384),
        A.CenterCrop(256, 256),
        A.Normalize(),
        ToTensorV2(),
    ]),
)

# One shuffling DataLoader per phase, all with the same batch size and
# worker count.
dataloaders = {
    phase: DataLoader(split, batch_size=32, shuffle=True, num_workers=4)
    for phase, split in (
        ('train', train_data),
        ('val', val_data),
        ('test', test_data),
    )
}

# Number of samples per phase; train_model divides by these to average the
# loss and accuracy.
dataset_sizes = {
    phase: len(split)
    for phase, split in (
        ('train', train_data),
        ('val', val_data),
        ('test', test_data),
    )
}

def train_model(model, criterion, optimizer, epochs=1):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase followed by a 'val' phase over the
    module-level ``dataloaders``. The weights of the epoch with the lowest
    validation loss are kept and loaded back before returning.

    Args:
        model: network to train; it must already be on the module-level
            ``device``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped once per training batch.
        epochs: number of passes over the data.

    Returns:
        ``model`` with the best-epoch weights loaded.
    """
    # Start of the wall-clock measurement. This used to be 0.0, which made
    # the reported duration the time elapsed since the Unix epoch.
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 0.0
    best_acc = 0

    for ep in range(epochs):
        print(f"Epoch {ep}/{epochs-1}")
        print("-"*10)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for images, labels in dataloaders[phase]:
                images = images.to(device)
                labels = labels.to(device)

                optimizer.zero_grad()
                # Gradients are only tracked during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(images)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # loss.item() is scaled by the batch size here and divided by
                # the dataset size below.
                running_loss += loss.item() * images.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print(f"{phase} Loss:{epoch_loss:.4f} Acc:{epoch_acc:.4f}")

            # The first epoch always becomes the baseline; later epochs must
            # improve on the validation loss.
            if phase == 'val' and (ep == 0 or epoch_loss < best_loss):
                best_loss = epoch_loss
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60}m {time_elapsed % 60}s')
    print(f'Best val loss: {best_loss:.4f}')
    print(f'Best acc: {best_acc}')

    model.load_state_dict(best_model_wts)
    return model

# Train The Model

# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ResMLP(
    image_size=256,
    patch_size=16,
    dim=512,
    depth=12,
    num_classes=2,
)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Lamb(
    model.parameters(),
    lr=0.005,
    weight_decay=0.2,
)

best_model = train_model(model, criterion, optimizer, epochs=20)

Output

# ResMLP

Best val loss: 0.2860

Best acc: 0.891

# ResNet-18

Best val loss: 0.0365

Best acc: 0.986

# ResNet-50

Best val loss: 0.0245

Best acc: 0.993

RESULT:


EX NO: 4

NAÏVE BAYESIAN CLASSIFIER

DATE:

AIM

To write a program to implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering few test data sets.

Source Code:

import csv
import random
import math

def loadCsv(filename):
    """Load a purely numeric CSV file.

    Args:
        filename: path of the CSV file.

    Returns:
        A list of rows, each a list of floats.

    Raises:
        ValueError: if a cell cannot be converted to ``float``.
    """
    # The context manager closes the file; the bare open() call never did.
    with open(filename, "r", newline="") as csvfile:
        # csv.reader yields [] for blank lines; skip them so no empty row
        # reaches the statistics code.
        return [[float(x) for x in row] for row in csv.reader(csvfile) if row]

def splitDataset(dataset, splitRatio):
    """Randomly split ``dataset`` into a training and a test part.

    Args:
        dataset: list of rows; it is not modified.
        splitRatio: fraction of rows to place in the training set
            (the driver uses 0.67).

    Returns:
        ``[trainSet, testSet]`` where ``trainSet`` holds
        ``int(len(dataset) * splitRatio)`` rows drawn at random and
        ``testSet`` holds the rest in their original relative order.
    """
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    # Work on a copy; the local is no longer named `copy`, which reads like
    # the stdlib module.
    remaining = list(dataset)
    # `and remaining` guards against splitRatio > 1, which used to end in
    # random.randrange(0) raising ValueError.
    while len(trainSet) < trainSize and remaining:
        index = random.randrange(len(remaining))
        trainSet.append(remaining.pop(index))
    return [trainSet, remaining]

def separateByClass(dataset):
    """Group the rows of ``dataset`` by class value.

    The class value is the last element of each row.

    Returns:
        A dict mapping each class value to the list of rows that belong to
        it, in their original order.
    """
    separated = {}
    for vector in dataset:
        separated.setdefault(vector[-1], []).append(vector)
    return separated

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    return sum(numbers) / float(len(numbers))

def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator) of ``numbers``.

    Raises:
        ZeroDivisionError: if ``numbers`` holds fewer than two values.
    """
    avg = mean(numbers)
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Return ``(mean, stdev)`` for every attribute column of ``dataset``.

    The last column is the class value and gets no summary.

    Returns:
        A list with one ``(mean, stdev)`` tuple per attribute, in column
        order.
    """
    # Transpose rows into columns and drop the class column before computing
    # statistics, rather than computing them and discarding the last pair.
    columns = list(zip(*dataset))[:-1]
    return [(mean(attribute), stdev(attribute)) for attribute in columns]

def summarizeByClass(dataset):
    """Return per-class attribute summaries for ``dataset``.

    Returns:
        A dict mapping each class value to the list of ``(mean, stdev)``
        tuples that ``summarize`` computes for the rows of that class.
    """
    separated = separateByClass(dataset)
    return {
        classValue: summarize(instances)
        for classValue, instances in separated.items()
    }

def calculateProbability(x, mean, stdev):
    """Return the Gaussian probability density of ``x``.

    Args:
        x: value to evaluate.
        mean: mean of the normal distribution.
        stdev: standard deviation of the normal distribution.

    Raises:
        ZeroDivisionError: if ``stdev`` is zero.
    """
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Return the unnormalised likelihood of ``inputVector`` for every class.

    Args:
        summaries: dict mapping class value to a list of ``(mean, stdev)``
            tuples, one per attribute.
        inputVector: attribute values to classify; element ``i`` is scored
            against summary ``i``.

    Returns:
        A dict mapping each class value to the product of the per-attribute
        Gaussian densities.
    """
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        # Local names avoid shadowing the module-level mean() and stdev().
        for i, (attr_mean, attr_stdev) in enumerate(classSummaries):
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, attr_mean, attr_stdev)
    return probabilities

def predict(summaries, inputVector):
    """Return the class value with the highest likelihood for ``inputVector``.

    Returns ``None`` when ``summaries`` is empty.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(summaries, testSet):
    """Return the predicted class for every row of ``testSet``, in order."""
    return [predict(summaries, row) for row in testSet]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose class matches its prediction.

    Args:
        testSet: list of rows; the last element of each row is the true class.
        predictions: predicted class per row, aligned with ``testSet``.

    Returns:
        Accuracy in percent (0.0 to 100.0); ``0.0`` for an empty test set
        instead of dividing by zero.
    """
    if not testSet:
        return 0.0
    correct = sum(
        1 for row, predicted in zip(testSet, predictions) if row[-1] == predicted
    )
    return (correct / float(len(testSet))) * 100.0

def main():
    """Train and evaluate the Gaussian naive Bayes classifier on '5data.csv'."""
    filename = '5data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))

    # prepare model
    summaries = summarizeByClass(trainingSet)

    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy of the classifier is : {0}%'.format(accuracy))


# Run only when executed as a script, so importing this module does not try
# to read '5data.csv'.
if __name__ == "__main__":
    main()

Output

Confusion matrix is as follows

[[17  0  0]
 [ 0 17  0]
 [ 0  0 11]]

Accuracy metrics

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        17
          1       1.00      1.00      1.00        17
          2       1.00      1.00      1.00        11

avg / total       1.00      1.00      1.00        45

RESULT:


EX NO: 5

NAÏVE BAYESIAN CLASSIFIER USING BUILT-IN FUNCTIONS

DATE:

AIM:

Assuming a set of documents that need to be classified, use the naïve Bayesian Classifier model to perform this task. Built-in Java classes/API can be used to write the program. Calculate the accuracy, precision, and recall for your data set.

Source Code:

import pandas as pd

# Each row of the file is "<message>,<label>" with label 'pos' or 'neg'.
msg = pd.read_csv('naivetext1.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})

X = msg.message
y = msg.labelnum
print(X)
print(y)

# splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)

# output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)

# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement. tolist() keeps the printed form a plain list of str.
feature_names = count_vect.get_feature_names_out().tolist()
print(feature_names)

df = pd.DataFrame(xtrain_dtm.toarray(), columns=feature_names)
print(df)  # tabular representation
print(xtrain_dtm)  # sparse matrix representation

# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

# printing accuracy metrics
from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the classifer is', metrics.accuracy_score(ytest, predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precison ')
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))

# The block below is disabled example code kept as a string literal.
'''docs_new = ['I like this place', 'My boss is not my saviour']
X_new_counts = count_vect.transform(docs_new)
predictednew = clf.predict(X_new_counts)
for doc, category in zip(docs_new, predictednew):
    print('%s->%s' % (doc, msg.labelnum[category]))'''

I love this sandwich,pos
This is an amazing place,pos
I feel very good about these beers,pos
This is my best work,pos
What an awesome view,pos
I do not like this restaurant,neg
I am tired of this stuff,neg
I can't deal with this,neg
He is my sworn enemy,neg
My boss is horrible,neg
This is an awesome place,pos
I do not like the taste of this juice,neg
I love to dance,pos
I am sick and tired of this place,neg
What a great holiday,pos
That is a bad locality to stay,neg
We will have good fun tomorrow,pos
I went to my enemy's house today,neg

OUTPUT

['about', 'am', 'amazing', 'an', 'and', 'awesome', 'beers', 'best', 'boss', 'can', 'deal',

'do', 'enemy', 'feel', 'fun', 'good', 'have', 'horrible', 'house', 'is', 'like', 'love', 'my',

'not', 'of', 'place', 'restaurant', 'sandwich', 'sick', 'stuff', 'these', 'this', 'tired', 'to',

'today', 'tomorrow', 'very', 'view', 'we', 'went', 'what', 'will', 'with', 'work'] about am amazing an and awesome beers best boss can ... today \

0

1 0 0 0 0 0 1

0 0 0 ... 0

1

0 0 0 0 0 0 0

1 0 0 ... 0

2

0 0 1 1 0 0

0 0 0 0 ... 0

3

0 0 0 0 0 0 0

0 0 0 ... 1

4

0 0 0 0 0 0 0

0 0 0 ... 0

5

0 1 0 0 1

0 0 0 0 0 ... 0

6

0 0 0 0 0 0 0

0 0 1 ... 0

7

0 0 0 0 0 0 0

0 0 0 ... 0

8

0 1 0 0 0 0 0

0 0 0 ... 0

9

0 0 0 1 0 1 0

0 0 0 ... 0

10 0 0

0 0 0 0 0 0 0 0 ... 0

11 0 0

0 0 0 0 0 0 1 0 ... 0

12 0 0

0 1 0 1 0 0 0 0 ... 0

tomorrow very view we went what will with work 0 0 1 0 0 0 0 0 0 0

1

0

0

0

0

0 0

0

0

1

2

0

0

0

0

0 0

0

0

0

3

0

0

0

0

1 0

0

0

0

4

0

0

0

0

0 0

0

0

0

5

0

0

0

0

0 0

0

0

0

6

0

0

0

0

0 0

0

1

0

7

1

0

0

1

0 0

1

0

0

8

0

0

0

0

0 0

0

0

0

RESULT:


EX NO: 6

BAYESIAN NETWORK

DATE:

AIM:

To write a program to construct a Bayesian network considering medical data. Use this model to demonstrate the diagnosis of heart patients using standard Heart Disease Data Set. You can use Java/Python ML library classes/API.

Source Code:

import bayespy as bp

import numpy as np

import csv

from colorama import init

from colorama import Fore, Back, Style

init()

# Define Parameter Enum values
# Every mapping assigns consecutive integer codes, starting at 0, to the
# category names in the order listed.


def _codes(*names):
    """Return ``{name: position}`` for ``names``, preserving their order."""
    return {name: code for code, name in enumerate(names)}


# Age
ageEnum = _codes('SuperSeniorCitizen', 'SeniorCitizen', 'MiddleAged', 'Youth', 'Teen')

# Gender
genderEnum = _codes('Male', 'Female')

# FamilyHistory
familyHistoryEnum = _codes('Yes', 'No')

# Diet(Calorie Intake)
dietEnum = _codes('High', 'Medium', 'Low')

# LifeStyle
lifeStyleEnum = _codes('Athlete', 'Active', 'Moderate', 'Sedetary')

# Cholesterol
cholesterolEnum = _codes('High', 'BorderLine', 'Normal')

# HeartDisease
heartDiseaseEnum = _codes('Yes', 'No')

# heart_disease_data.csv
with open('heart_disease_data.csv', newline='') as csvfile:
    lines = csv.reader(csvfile)
    dataset = list(lines)

# Translate every row from category names to their integer codes.
data = []
for x in dataset:
    if not x:
        # csv.reader yields [] for blank lines; indexing x[0] would raise.
        continue
    data.append([
        ageEnum[x[0]],
        genderEnum[x[1]],
        familyHistoryEnum[x[2]],
        dietEnum[x[3]],
        lifeStyleEnum[x[4]],
        cholesterolEnum[x[5]],
        heartDiseaseEnum[x[6]],
    ])

# Training data for machine learning todo: should import from csv
data = np.array(data)
N = len(data)

# Input data column assignment


def _observed_categorical(num_categories, observations):
    """Create a Dirichlet prior and an observed Categorical node over it.

    Returns ``(prior, node)``. The node has one plate per data row (``N``) and
    is observed with ``observations``.
    """
    prior = bp.nodes.Dirichlet(1.0*np.ones(num_categories))
    node = bp.nodes.Categorical(prior, plates=(N,))
    node.observe(observations)
    return prior, node


# One (prior, node) pair per input column; the first argument is the number
# of categories in that column's enum.
p_age, age = _observed_categorical(5, data[:,0])
p_gender, gender = _observed_categorical(2, data[:,1])
p_familyhistory, familyhistory = _observed_categorical(2, data[:,2])
p_diet, diet = _observed_categorical(3, data[:,3])
p_lifestyle, lifestyle = _observed_categorical(4, data[:,4])
p_cholesterol, cholesterol = _observed_categorical(3, data[:,5])

# Prepare nodes and establish edges
# np.ones(2) -> HeartDisease has 2 options Yes/No
# plates(5, 2, 2, 3, 4, 3) -> corresponds to options present for domain values
p_heartdisease = bp.nodes.Dirichlet(np.ones(2), plates=(5, 2, 2, 3, 4, 3))

parent_nodes = [age, gender, familyhistory, diet, lifestyle, cholesterol]
heartdisease = bp.nodes.MultiMixture(parent_nodes, bp.nodes.Categorical, p_heartdisease)
heartdisease.observe(data[:,6])

p_heartdisease.update()

# Sample Test with hardcoded values
# (disabled; every continuation line is now commented too, since the wrapped
# lines had lost their '#' and were syntax errors)
# print("Sample Probability")
# print("Probability(HeartDisease|Age=SuperSeniorCitizen, Gender=Female, "
#       "FamilyHistory=Yes, DietIntake=Medium, LifeStyle=Sedetary, Cholesterol=High)")
# print(bp.nodes.MultiMixture([ageEnum['SuperSeniorCitizen'], genderEnum['Female'],
#       familyHistoryEnum['Yes'], dietEnum['Medium'], lifeStyleEnum['Sedetary'],
#       cholesterolEnum['High']], bp.nodes.Categorical, p_heartdisease).get_moments()[0]
#       [heartDiseaseEnum['Yes']])

# Interactive Test
m = 0
while m == 0:
    print("\n")
    # The prompts are evaluated in list order: age, gender, family history,
    # diet, lifestyle, cholesterol.
    answers = [
        int(input('Enter Age: ' + str(ageEnum))),
        int(input('Enter Gender: ' + str(genderEnum))),
        int(input('Enter FamilyHistory: ' + str(familyHistoryEnum))),
        int(input('Enter dietEnum: ' + str(dietEnum))),
        int(input('Enter LifeStyle: ' + str(lifeStyleEnum))),
        int(input('Enter Cholesterol: ' + str(cholesterolEnum))),
    ]
    moments = bp.nodes.MultiMixture(answers, bp.nodes.Categorical, p_heartdisease).get_moments()[0]
    # This index sat on a line of its own, so it was a separate no-op
    # statement and `res` held the whole moments array.
    res = moments[heartDiseaseEnum['Yes']]
    print("Probability(HeartDisease) = " + str(res))
    # print(Style.RESET_ALL)
    m = int(input("Enter for Continue:0, Exit :1 "))

OUTPUT:

Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}0

Enter Gender: {'Male': 0, 'Female': 1}0

Enter FamilyHistory: {'Yes': 0, 'No': 1}0

Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}0

Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}0

Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}0

Probability(HeartDisease) = 0.5

Enter for Continue:0, Exit :1 0

Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}4

Enter Gender: {'Male': 0, 'Female': 1}0

Enter FamilyHistory: {'Yes': 0, 'No': 1}0

Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}1

Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}3

Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}2

Probability(HeartDisease) = 0.13784165696493575

Enter for Continue:0, Exit :1 0

Enter Age: {'SuperSeniorCitizen': 0, 'SeniorCitizen': 1, 'MiddleAged': 2, 'Youth': 3, 'Teen': 4}3

Enter Gender: {'Male': 0, 'Female': 1}1

Enter FamilyHistory: {'Yes': 0, 'No': 1}0

Enter dietEnum: {'High': 0, 'Medium': 1, 'Low': 2}1

Enter LifeStyle: {'Athlete': 0, 'Active': 1, 'Moderate': 2, 'Sedetary': 3}0

Enter Cholesterol: {'High': 0, 'BorderLine': 1, 'Normal': 2}1

Probability(HeartDisease) = 0.2689414213699951

Enter for Continue:0, Exit :1


EX NO: 7

EM ALGORITHM

DATE:

AIM:

To apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering using k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering. You can add Java/Python ML library classes/API in the program.

Source Code:

import numpy as np

import matplotlib.pyplot as plt

# sklearn.datasets.samples_generator was removed from scikit-learn;
# make_blobs is importable from sklearn.datasets.
from sklearn.datasets import make_blobs

# `cluster_std` is lower-case; `Cluster_std` raised a TypeError.
X, y_true = make_blobs(n_samples=100, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]  # flip axes for better plotting

from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=4).fit(X)
# This was assigned to the misspelt name `lables`, leaving `labels` undefined.
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')

# Per-sample membership probability for each of the four components.
probs = gmm.predict_proba(X)
print(probs[:5].round(3))

size = 50 * probs.max(1) ** 2  # square emphasizes differences
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size)

from matplotlib.patches import Ellipse

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw an ellipse with a given position and covariance.

    Three concentric ellipses are added, scaled by 1, 2 and 3.

    Args:
        position: centre of the ellipse.
        covariance: either a (2, 2) covariance matrix or a value that
            unpacks into two variances (presumably a length-2 diagonal;
            confirm against the covariance_type in use).
        ax: axes to draw on; defaults to the current axes.
        **kwargs: forwarded to ``matplotlib.patches.Ellipse``.
    """
    ax = ax or plt.gca()

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)

    # Draw the Ellipse
    for nsig in range(1, 4):
        # `angle` is passed by keyword: recent matplotlib releases no longer
        # accept it positionally.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height, angle=angle, **kwargs))

def plot_gmm(gmm, X, label=True, ax=None):
    """Fit ``gmm`` to ``X`` and plot the points with one ellipse set per component.

    Args:
        gmm: mixture model; it is (re)fitted on ``X`` here.
        X: 2-D array whose first two columns are plotted.
        label: colour the points by predicted component when true.
        ax: axes to draw on; defaults to the current axes.
    """
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    # The y-coordinates were read from an undefined lower-case `x`.
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')

    # Scale transparency so the heaviest component gets alpha 0.2.
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # Pass `ax` on so the ellipses land on the same axes as the points.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

# Default covariance type.
gmm = GaussianMixture(n_components=4, random_state=42)
plot_gmm(gmm, X)

# Explicit full covariance matrices.
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42)
plot_gmm(gmm, X)

Output

[[1 ,0, 0, 0]

[0 ,0, 1, 0]

[1 ,0, 0, 0]

[1 ,0, 0, 0]

[1 ,0, 0, 0]]

K-means

from sklearn.cluster import KMeans
# from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv("kmeansdata.csv")
df1 = pd.DataFrame(data)
print(df1)

f1 = df1['Distance_Feature'].values
f2 = df1['Speeding_Feature'].values
# A plain (n_samples, 2) array; np.matrix is deprecated and rejected by
# recent scikit-learn estimators.
X = np.column_stack((f1, f2))

# Plot the raw data.
plt.plot()
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.title('Dataset')
plt.ylabel('speeding_feature')
plt.xlabel('Distance_Feature')
plt.scatter(f1, f2)
plt.show()

# create new plot and data
plt.plot()
colors = ['b', 'g', 'r']
markers = ['o', 'v', 's']

# KMeans algorithm
# K = 3
kmeans_model = KMeans(n_clusters=3).fit(X)

# Plot every point with the colour and marker of its cluster.
plt.plot()
for i, l in enumerate(kmeans_model.labels_):
    plt.plot(f1[i], f2[i], color=colors[l], marker=markers[l], ls='None')
plt.xlim([0, 100])
plt.ylim([0, 50])
plt.show()

Driver_ID,Distance_Feature,Speeding_Feature

3423311935,71.24,28

3423313212,52.53,25

3423313724,64.54,27

3423311373,55.69,22

3423310999,54.58,25

3423313857,41.91,10

3423312432,58.64,20

3423311434,52.02,8

3423311328,31.25,34

3423312488,44.31,19

3423311254,49.35,40

3423312943,58.07,45

3423312536,44.22,22

3423311542,55.73,19

3423312176,46.63,43

3423314176,52.97,32

3423314202,46.25,35

3423311346,51.55,27

3423310666,57.05,26

3423313527,58.45,30

3423312182,43.42,23

3423313590,55.68,37

3423312268,55.15,18

RESULT:


EX NO: 8

PRINCIPAL COMPONENT ANALYSIS FOR DIMENSIONALITY REDUCTION

DATE:

AIM:

To write a program to implement Principal Component Analysis for Dimensionality Reduction.

Source Code:

# NOTE(review): illustrative fragments. Within this exercise's listing, PCA,
# LogisticRegression and Pipeline are only imported further down and
# MinMaxScaler is not imported at all, so these lines are presumably not
# meant to run as-is; confirm before executing.

# define transform

pca = PCA()

# prepare transform on dataset

pca.fit(data)

# apply transform to dataset

transformed = pca.transform(data)

# define the pipeline

steps = [('pca', PCA()), ('m', LogisticRegression())]

model = Pipeline(steps=steps)

# define the pipeline
# (variant with a MinMaxScaler step ahead of PCA; it rebinds `steps` and
# `model` from the fragment above)

steps = [('norm', MinMaxScaler()), ('pca', PCA()), ('m', LogisticRegression())]

model = Pipeline(steps=steps)

# test classification dataset
from sklearn.datasets import make_classification

# define dataset: 1000 samples, 20 features (15 informative, 5 redundant)
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# summarize the dataset
print(X.shape, y.shape)

(1000, 20) (1000,)

# evaluate pca with logistic regression algorithm for classification
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# define dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# define the pipeline: project onto 10 principal components, then classify
steps = [
    ('pca', PCA(n_components=10)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

# evaluate model with 10-fold stratified cross-validation repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# evaluate pca with logistic regression algorithm for classification
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# define dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# define the pipeline: PCA down to 10 components feeding logistic regression
steps = [
    ('pca', PCA(n_components=10)),
    ('m', LogisticRegression()),
]
model = Pipeline(steps=steps)

# evaluate model: repeated stratified k-fold, scored by accuracy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.816 (0.034)

# compare pca number of components with logistic regression algorithm for classification

from numpy import mean

from numpy import std

from sklearn.datasets import make_classification

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.pipeline import Pipeline

from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot

# get the dataset

def get_dataset():
    """Return the synthetic classification dataset as ``(X, y)``.

    1000 samples with 20 features (15 informative, 5 redundant), generated
    with a fixed random state.
    """
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
    return X, y

# get a list of models to evaluate

def get_models(max_components=20):
    """Return the pipelines to compare, keyed by PCA component count.

    Args:
        max_components: largest number of PCA components to try; one
            pipeline is built for every count from 1 up to this value. The
            default of 20 matches the feature count of ``get_dataset``.

    Returns:
        A dict mapping ``str(n)`` to a PCA(n) + LogisticRegression pipeline,
        in increasing order of ``n``.
    """
    models = dict()
    for i in range(1, max_components + 1):
        steps = [('pca', PCA(n_components=i)), ('m', LogisticRegression())]
        models[str(i)] = Pipeline(steps=steps)
    return models

# evaluate a given model using cross-validation

def evaluate_model(model, X, y):
    """Return the cross-validated accuracy scores of ``model`` on ``(X, y)``.

    Uses 10-fold stratified cross-validation repeated 3 times with a fixed
    random state; a failing fit raises instead of being scored.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# define dataset
X, y = get_dataset()

# get the models to evaluate
models = get_models()

# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.xticks(rotation=45)
pyplot.show()

Output:

>1 0.542 (0.048)

>2 0.713 (0.048)

>3 0.720 (0.053)

>4 0.723 (0.051)

>5 0.725 (0.052)

>6 0.730 (0.046)

>7 0.805 (0.036)

>8 0.800 (0.037)

>9 0.814 (0.036)

>10 0.816 (0.034)

>11 0.819 (0.035)

>12 0.819 (0.038)

>13 0.819 (0.035)

>14 0.853 (0.029)

>15 0.865 (0.027)

>16 0.865 (0.027)

>17 0.865 (0.027)

>18 0.865 (0.027)

>19 0.865 (0.027)

>20 0.865 (0.027)

RESULT:



EX NO: 9

K-NEAREST NEIGHBOUR ALGORITHM

DATE:

AIM:

To write a program to implement k-Nearest Neighbour algorithm to classify the iris data set. Print both correct and wrong predictions. Java/Python ML library classes can be used for this problem.

Source Code:

import csv import random import math import operator

def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Load a CSV file and distribute its rows between two lists at random.

    The first four columns of every row are converted to ``float``; any
    further columns are kept as strings.

    Args:
        filename: path of the CSV file.
        split: probability that a row goes to ``trainingSet``.
        trainingSet: list to append training rows to (a new list if omitted).
        testSet: list to append test rows to (a new list if omitted).

    Returns:
        ``(trainingSet, testSet)``, the same lists that were passed in.
    """
    # None defaults replace the shared mutable [] defaults, which would have
    # accumulated rows across calls.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # Text mode: csv.reader needs str rows, so the Python 2 era 'rb' fails.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines are skipped explicitly; the old code instead always
            # dropped the last row, losing data when the file had no
            # trailing blank line.
            if not row:
                continue
            for y in range(4):
                row[y] = float(row[y])
            if random.random() < split:
                trainingSet.append(row)
            else:
                testSet.append(row)
    return trainingSet, testSet

def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean distance over the first ``length`` components."""
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance``.

    The last element of each row is excluded from the distance computation.
    Fewer than ``k`` rows are returned when the training set is smaller than
    ``k``; indexing used to raise IndexError in that case.
    """
    length = len(testInstance) - 1
    distances = [
        (candidate, euclideanDistance(testInstance, candidate, length))
        for candidate in trainingSet
    ]
    distances.sort(key=operator.itemgetter(1))
    return [candidate for candidate, _ in distances[:k]]

def getResponse(neighbors):
    """Return the class that occurs most often among ``neighbors``.

    The class of a neighbour is its last element. On a tie, the class that
    was seen first wins.
    """
    classVotes = {}
    for neighbor in neighbors:
        response = neighbor[-1]
        classVotes[response] = classVotes.get(response, 0) + 1
    # Sort by vote count (item 1). Without the key the pairs were ordered by
    # class label, and dict.iteritems() does not exist in Python 3.
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose class matches its prediction.

    Args:
        testSet: list of rows; the last element of each row is the true class.
        predictions: predicted class per row, aligned with ``testSet``.

    Returns:
        Accuracy in percent; ``0.0`` for an empty test set instead of
        dividing by zero.
    """
    if not testSet:
        return 0.0
    correct = 0
    for row, predicted in zip(testSet, predictions):
        if row[-1] == predicted:
            correct += 1
    return (correct / float(len(testSet))) * 100.0

def main():
    """Classify 'knndat.data' with 3-nearest-neighbours and report accuracy."""
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('knndat.data', split, trainingSet, testSet)
    print('Train set: ' + repr(len(trainingSet)))
    print('Test set: ' + repr(len(testSet)))

    # generate predictions
    predictions = []
    k = 3
    for instance in testSet:
        neighbors = getNeighbors(trainingSet, instance, k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(instance[-1]))

    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')


# Run only when executed as a script, so importing this module does not try
# to read 'knndat.data'.
if __name__ == "__main__":
    main()


OUTPUT

Confusion matrix is as follows

[[11 0 0]

[0 9 1]

[0 1 8]]

Accuracy metrics

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        11
          1       0.90      0.90      0.90        10
          2       0.89      0.89      0.89         9

avg / total       0.93      0.93      0.93        30

RESULT:


EX NO: 10

LOCALLY WEIGHTED REGRESSION ALGORITHM

DATE:

AIM:

To implement the non-parametric Locally Weighted Regression algorithm in order to fit data points. Select appropriate data set for your experiment and draw graphs.

Source Code:

# NOTE: in this program `np1` is NumPy itself and `np` is numpy.linalg.
from numpy import *
import operator
from os import listdir
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np1
import numpy.linalg as np
# scipy.stats.stats is a deprecated private path; pearsonr is public in
# scipy.stats.
from scipy.stats import pearsonr

def kernel(point, xmat, k):
    """Return the diagonal Gaussian weight matrix for ``point``.

    Args:
        point: query row.
        xmat: 2-D array or matrix of training rows, one per sample.
        k: bandwidth; larger values weight distant samples more.

    Returns:
        An ``(m, m)`` matrix whose j-th diagonal entry is
        ``exp(-||point - xmat[j]||**2 / (2 * k**2))``.
    """
    m, n = np1.shape(xmat)
    # np1.asmatrix replaces np1.mat, which was removed in NumPy 2.0.
    weights = np1.asmatrix(np1.eye(m))
    for j in range(m):
        # Use the `xmat` argument; the listing read a module-level `X`.
        # Flattening gives a scalar squared distance instead of a 1x1 matrix.
        diff = np1.asarray(point - xmat[j]).ravel()
        weights[j, j] = np1.exp(np1.dot(diff, diff) / (-2.0 * k**2))
    return weights

def localWeight(point, xmat, ymat, k):
    """Return the locally weighted least-squares coefficients for ``point``.

    Computes ``(X^T W X)^-1 X^T W y`` where ``W`` is the kernel weight matrix
    for ``point``.

    Args:
        point: query row.
        xmat: design matrix as an ``np.matrix`` (``*`` and ``.I`` rely on
            matrix semantics).
        ymat: targets as a row ``np.matrix`` (it is transposed here).
        k: kernel bandwidth.
    """
    wei = kernel(point, xmat, k)
    # Use the `xmat` argument; the listing read a module-level `X`.
    W = (xmat.T * (wei * xmat)).I * (xmat.T * (wei * ymat.T))
    return W

def localWeightRegression(xmat, ymat, k):
    """Return the locally weighted prediction for every row of ``xmat``.

    Args:
        xmat: design matrix as an ``np.matrix``, one row per sample.
        ymat: targets as a row ``np.matrix``.
        k: kernel bandwidth.

    Returns:
        A 1-D array with one prediction per row of ``xmat``.
    """
    m, n = np1.shape(xmat)
    ypred = np1.zeros(m)
    for i in range(m):
        # The product is a 1x1 matrix; take its only element explicitly
        # rather than relying on implicit array-to-scalar conversion.
        ypred[i] = (xmat[i] * localWeight(xmat[i], xmat, ymat, k))[0, 0]
    return ypred

# load data points
data = pd.read_csv('data10.csv')
bill = np1.array(data.total_bill)
tip = np1.array(data.tip)

# preparing and add 1 in bill
# (np1.asmatrix replaces np1.mat, which was removed in NumPy 2.0)
mbill = np1.asmatrix(bill)
mtip = np1.asmatrix(tip)
m = np1.shape(mbill)[1]
one = np1.asmatrix(np1.ones(m))
# Design matrix: a column of ones followed by the bill column.
X = np1.hstack((one.T, mbill.T))

# set k here
ypred = localWeightRegression(X, mtip, 2)

# Order of the samples by increasing bill.
SortIndex = X[:, 1].argsort(0)
xsort = X[SortIndex][:, 0]

Output

No comments:

Post a Comment

IV CSE 3 ML LAB 2021

EX NO: 1 DECISION TREE BASED ID3 ALGORITHM DATE: AIM: To write a program to demonstrate the working of the decision tree based ID3 algorithm...