From 2fe87db4b7f2ddef8edb2d084ff7e4d79ebee864 Mon Sep 17 00:00:00 2001
From: Siddhi Jha <35024750+siddhi523@users.noreply.github.com>
Date: Sat, 18 Apr 2020 01:26:40 +0530
Subject: [PATCH] Update email_preprocess.py

Fixed the cross_validation import error, the cPickle import error, the "a bytes-like object is required, not 'str'" error, and a few syntax errors.
---
 tools/email_preprocess.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py
index 2528b995904..41663510c0e 100644
--- a/tools/email_preprocess.py
+++ b/tools/email_preprocess.py
@@ -1,10 +1,9 @@
 #!/usr/bin/python
 
 import pickle
-import cPickle
 import numpy
 
-from sklearn import cross_validation
+from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectPercentile, f_classif
 
@@ -29,17 +28,17 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
 
     ### the words (features) and authors (labels), already largely preprocessed
     ### this preprocessing will be repeated in the text learning mini-project
-    authors_file_handler = open(authors_file, "r")
+    authors_file_handler = open(authors_file, "rb")
     authors = pickle.load(authors_file_handler)
     authors_file_handler.close()
 
-    words_file_handler = open(words_file, "r")
-    word_data = cPickle.load(words_file_handler)
+    words_file_handler = open(words_file, "rb")
+    word_data = pickle.load(words_file_handler)
     words_file_handler.close()
 
     ### test_size is the percentage of events assigned to the test set
     ### (remainder go into training)
-    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
+    features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)
 
 
 
@@ -59,7 +58,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
     features_test_transformed  = selector.transform(features_test_transformed).toarray()
 
     ### info on the data
-    print "no. of Chris training emails:", sum(labels_train)
-    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
+    print("no. of Chris training emails:", sum(labels_train))
+    print("no. of Sara training emails:", len(labels_train)-sum(labels_train))
     
     return features_train_transformed, features_test_transformed, labels_train, labels_test