0% found this document useful (0 votes)
4 views

project code

The document outlines a Python script for predicting computer virus presence using machine learning techniques. It involves data preprocessing, model training with a Random Forest Classifier, and generating predictions on a test dataset. The script also includes data visualization and evaluation metrics for model performance.

Uploaded by

sirajknl
Copyright
© All Rights Reserved
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
4 views

project code

The document outlines a Python script for predicting computer virus presence using machine learning techniques. It involves data preprocessing, model training with a Random Forest Classifier, and generating predictions on a test dataset. The script also includes data visualization and evaluation metrics for model performance.

Uploaded by

sirajknl
Copyright
© All Rights Reserved
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/ 2

# This Python 3 environment comes with many helpful analytics libraries installed

# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra


import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory


# For example, running this (by clicking run or pressing Shift+Enter) will list all
# files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available data files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets
# preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved
# outside of the current session

# Output of the cell above:
# /kaggle/input/computer-virus-predictor/sample_submission.csv
# /kaggle/input/computer-virus-predictor/train.csv
# /kaggle/input/computer-virus-predictor/test.csv

# Load the training data for the competition.
train_df = pd.read_csv('/kaggle/input/computer-virus-predictor/train.csv')

# Columns removed before modelling.
# NOTE(review): presumably these are identifier / non-numeric columns the
# classifier cannot consume directly -- confirm against train_df.dtypes.
train_unused_columns = [
    'MachineID',
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'SignatureVersion',
    'IsBetaUser',
    'PlatformType',
    'Processor',
    'OSVersion',
    'OsPlatformSubRelease',
    'OSBuildLab',
    'SKUEditionName',
    'MDC2FormFactor',
    'DeviceFamily',
    'PrimaryDiskType',
    'ChassisType',
    'PowerPlatformRole',
    'NumericOSVersion',
    'OSArchitecture',
    'OSBranch',
    'OSEdition',
    'OSSkuFriendlyName',
    'OSInstallType',
    'AutoUpdateOptionsName',
    'OSGenuineState',
    'LicenseActivationChannel',
    'FlightRing',
    'DateAS',
    'DateOS',
]
train_1_df = train_df.drop(columns=train_unused_columns)

# Keep the original row position as an explicit "id" column.
# NOTE(review): this column stays in the frame, so it is also fed to the model
# as a feature further down -- confirm that is intended.
train_1_df["id"] = train_1_df.index

# Bare expressions: these only display something when run as notebook cells.
train_1_df.head()
train_1_df.tail()

train_1_df.dtypes

import matplotlib.pyplot as plt


import seaborn as sns
# Draw a heatmap of the pairwise correlations between the numeric columns.
numeric_corr = train_1_df.corr(numeric_only=True)
sns.heatmap(numeric_corr)

from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split

# A training row is kept only if every one of these columns is populated.
train_required_columns = [
    'RealTimeProtectionState',
    'AntivirusConfigID',
    'CityID',
    'SMode',
    'IEVersionID',
    'ProcessorCoreCount',
    'PrimaryDisplayDiagonalInches',
    'FirewallEnabled',
    'TotalPhysicalRAMMB',
    'EnableLUA',
    'OEMModelID',
    'InternalBatteryNumberOfCharges',
    'IsGamer',
    'OSInstallLanguageID',
    'IsFlightsDisabled',
    'FirmwareManufacturerID',
    'IsVirtualDevice',
]
# Drop every row with a missing value in any required column (single pass,
# original index labels are preserved).
train_1_df = train_1_df.dropna(subset=train_required_columns)

# Upper-case names hold ALL remaining rows; the lower-case names below hold
# the 70/30 train / hold-out split of those rows.
X_train = train_1_df.drop(columns=['target'])
Y_train = train_1_df['target']
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.3
)

# Bare expression: shows the remaining per-column NaN counts in a notebook.
train_1_df.isna().sum()

# Create the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate with a model fitted on the training split ONLY. Fitting on the full
# X_train / Y_train here would let the model see the hold-out rows in x_test
# during training, which makes the report below meaninglessly optimistic.
eval_model = RandomForestClassifier()
eval_model.fit(x_train, y_train)
prediction = eval_model.predict(x_test)

print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))

# Final model used for later predictions: refit on every available row now
# that the hold-out metrics have been reported.
model = RandomForestClassifier()
model.fit(X_train, Y_train)

# Load the unlabelled competition test data.
test_df = pd.read_csv('/kaggle/input/computer-virus-predictor/test.csv')

# Same columns that are removed from the training frame, so the test features
# line up with what the model was fitted on.
test_unused_columns = [
    'MachineID',
    'ProductName',
    'EngineVersion',
    'AppVersion',
    'SignatureVersion',
    'IsBetaUser',
    'PlatformType',
    'Processor',
    'OSVersion',
    'OsPlatformSubRelease',
    'OSBuildLab',
    'SKUEditionName',
    'MDC2FormFactor',
    'DeviceFamily',
    'PrimaryDiskType',
    'ChassisType',
    'PowerPlatformRole',
    'NumericOSVersion',
    'OSArchitecture',
    'OSBranch',
    'OSEdition',
    'OSSkuFriendlyName',
    'OSInstallType',
    'AutoUpdateOptionsName',
    'OSGenuineState',
    'LicenseActivationChannel',
    'FlightRing',
    'DateAS',
    'DateOS',
]
# Note: this rebinds x_test, which up to here held the hold-out features.
x_test = test_df.drop(columns=test_unused_columns)
x_test["id"] = x_test.index

# A test row is kept only if every one of these columns is populated.
test_required_columns = [
    'RealTimeProtectionState',
    'AntivirusConfigID',
    'CityID',
    'SMode',
    'IEVersionID',
    'ProcessorCoreCount',
    'PrimaryDisplayDiagonalInches',
    'FirewallEnabled',
    'TotalPhysicalRAMMB',
    'EnableLUA',
    'OEMModelID',
    'InternalBatteryNumberOfCharges',
    'IsGamer',
    'OSInstallLanguageID',
    'IsFlightsDisabled',
    'FirmwareManufacturerID',
    'IsVirtualDevice',
]
# NOTE(review): rows dropped here receive no prediction, so submission.csv can
# contain fewer rows than test.csv -- confirm the competition accepts that, or
# impute the missing values instead of dropping.
x_test = x_test.dropna(subset=test_required_columns)

prediction = model.predict(x_test)

# One output row per surviving test row: its id and the predicted target.
final_prediction = pd.DataFrame({'id': x_test['id'], 'target': prediction})

final_prediction.to_csv('submission.csv', index=False)

You might also like