From 2fe87db4b7f2ddef8edb2d084ff7e4d79ebee864 Mon Sep 17 00:00:00 2001 From: Siddhi Jha <35024750+siddhi523@users.noreply.github.com> Date: Sat, 18 Apr 2020 01:26:40 +0530 Subject: [PATCH] Update email_preprocess.py Fixed Cross_Validation import error, cPickle import error and byte_object required but 'str' found error and a few syntax errors --- tools/email_preprocess.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py index 2528b995904..41663510c0e 100644 --- a/tools/email_preprocess.py +++ b/tools/email_preprocess.py @@ -1,10 +1,9 @@ #!/usr/bin/python import pickle -import cPickle import numpy -from sklearn import cross_validation +from sklearn import train_test_split from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_selection import SelectPercentile, f_classif @@ -29,17 +28,17 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema ### the words (features) and authors (labels), already largely preprocessed ### this preprocessing will be repeated in the text learning mini-project - authors_file_handler = open(authors_file, "r") + authors_file_handler = open(authors_file, "rb") authors = pickle.load(authors_file_handler) authors_file_handler.close() - words_file_handler = open(words_file, "r") - word_data = cPickle.load(words_file_handler) + words_file_handler = open(words_file, "rb") + word_data = pickle.load(words_file_handler) words_file_handler.close() ### test_size is the percentage of events assigned to the test set ### (remainder go into training) - features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) + features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42) @@ -59,7 +58,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema 
features_test_transformed = selector.transform(features_test_transformed).toarray() ### info on the data - print "no. of Chris training emails:", sum(labels_train) - print "no. of Sara training emails:", len(labels_train)-sum(labels_train) + print("no. of Chris training emails:", sum(labels_train)) + print("no. of Sara training emails:", len(labels_train)-sum(labels_train)) return features_train_transformed, features_test_transformed, labels_train, labels_test