From b84cff61d8bba56694e4cae00d3e8581ae655036 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:30:04 +0530
Subject: [PATCH 01/14] Refer ChangeLog: Lecture 01

---
 tools/email_preprocess.py | 29 +++++++++-------------
 tools/startup.py | 52 +++++++++++++++++++--------------------
 2 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/tools/email_preprocess.py b/tools/email_preprocess.py
index 2528b995904..fd83cea3e9f 100644
--- a/tools/email_preprocess.py
+++ b/tools/email_preprocess.py
@@ -1,15 +1,13 @@
-#!/usr/bin/python
+#!/usr/bin/python3

-import pickle
-import cPickle
+import joblib
 import numpy

-from sklearn import cross_validation
+from sklearn.model_selection import train_test_split
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.feature_selection import SelectPercentile, f_classif

-
 def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
     """
         this function takes a pre-made list of email texts (by default word_data.pkl)
@@ -29,23 +27,20 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
     ### the words (features) and authors (labels), already largely preprocessed
     ### this preprocessing will be repeated in the text learning mini-project

-    authors_file_handler = open(authors_file, "r")
-    authors = pickle.load(authors_file_handler)
-    authors_file_handler.close()
+    authors_file_handler = open(authors_file, "rb")
+    authors = joblib.load(authors_file_handler)
+

-    words_file_handler = open(words_file, "r")
-    word_data = cPickle.load(words_file_handler)
-    words_file_handler.close()
+    words_file_handler = open(words_file, "rb")
+    word_data = joblib.load(words_file_handler)

     ### test_size is the percentage of events assigned to the test set
     ### (remainder go into training)
-    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
-
+    features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

     ### text vectorization--go from strings to lists of numbers
-    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
-                                 stop_words='english')
+    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
     features_train_transformed = vectorizer.fit_transform(features_train)
     features_test_transformed = vectorizer.transform(features_test)
@@ -59,7 +54,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
     features_test_transformed = selector.transform(features_test_transformed).toarray()

     ### info on the data
-    print "no. of Chris training emails:", sum(labels_train)
-    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
+    print("No. of Chris training emails : ", sum(labels_train))
+    print("No. of Sara training emails : ", len(labels_train)-sum(labels_train))

     return features_train_transformed, features_test_transformed, labels_train, labels_test

diff --git a/tools/startup.py b/tools/startup.py
index 4638e0d115e..b87c865f37c 100644
--- a/tools/startup.py
+++ b/tools/startup.py
@@ -1,47 +1,47 @@
-#!/usr/bin/python
+#!/usr/bin/python3

-print
-print "checking for nltk"
+print("Checking for nltk")
 try:
     import nltk
 except ImportError:
-    print "you should install nltk before continuing"
+    print("You should install nltk before continuing")

-print "checking for numpy"
+print("Checking for numpy")
 try:
     import numpy
 except ImportError:
-    print "you should install numpy before continuing"
+    print("You should install numpy before continuing")

-print "checking for scipy"
+print("Checking for scipy")
 try:
     import scipy
 except:
-    print "you should install scipy before continuing"
+    print("You should install scipy before continuing")

-print "checking for sklearn"
+print("Checking for sklearn")
 try:
     import sklearn
 except:
-    print "you should install sklearn before continuing"
-
-print
-print "downloading the Enron dataset (this may take a while)"
-print "to check on progress, you can cd up one level, then execute "
-print "Enron dataset should be last item on the list, along with its current size"
-print "download will complete at about 423 MB"
-import urllib
-url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
-urllib.urlretrieve(url, filename="../enron_mail_20150507.tar.gz")
-print "download complete!"
+    print("You should install sklearn before continuing")
+
+print("Downloading the Enron dataset (this may take a while)")
+print("To check on progress, you can cd up one level, then execute ")
+print("Enron dataset should be last item on the list, along with its current size")
+print("Download will complete at about 1.82 GB")
+import requests
+url = "https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tar.gz"
+filename = "../enron_mail_20150507.tar.gz"
+with open(filename, "wb") as f:
+    r = requests.get(url)
+    f.write(r.content)
+print("Download Complete!")

-print
-print "unzipping Enron dataset (this may take a while)"
+print("Unzipping Enron dataset (This may take a while)")
 import tarfile
-import os
-os.chdir("..")
-tfile = tarfile.open("enron_mail_20150507.tar.gz", "r:gz")
+tfile = tarfile.open("../enron_mail_20150507.tar.gz")
 tfile.extractall(".")
+tfile.close()
+
+print("You're ready to go!")

-print "you're ready to go!"

From 898c0eb9c66aea21689659b8941d19e7485a61f4 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:30:42 +0530
Subject: [PATCH 02/14] Refer ChangeLog: Lecture 02

---
 naive_bayes/nb_author_id.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/naive_bayes/nb_author_id.py b/naive_bayes/nb_author_id.py
index f69d57d8408..102da2ec13f 100644
--- a/naive_bayes/nb_author_id.py
+++ b/naive_bayes/nb_author_id.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     This is the code to accompany the Lesson 1 (Naive Bayes) mini-project.
@@ -22,12 +22,26 @@
 features_train, features_test, labels_train, labels_test = preprocess()

+##############################################################
+# Enter Your Code Here

-#########################################################
-### your code goes here ###

+##############################################################

-#########################################################
+##############################################################
+'''
+You will be required to record the time taken for training and predicting.
+The code given on the Udacity website is in Python-2;
+the following is the Python-3 version of the same code.
+'''
+# t0 = time()
+# # < your clf.fit() line of code >
+# print("Training Time:", round(time()-t0, 3), "s")

+# t0 = time()
+# # < your clf.predict() line of code >
+# print("Predicting Time:", round(time()-t0, 3), "s")
+
+##############################################################
\ No newline at end of file

From c71cd2c8a59ee6b39c06b844d17cdf1ae1dc92a3 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:31:15 +0530
Subject: [PATCH 03/14] Refer ChangeLog: Lecture 03

---
 svm/svm_author_id.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/svm/svm_author_id.py b/svm/svm_author_id.py
index fda3f7fdb28..35390d60ad6 100644
--- a/svm/svm_author_id.py
+++ b/svm/svm_author_id.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     This is the code to accompany the Lesson 2 (SVM) mini-project.
@@ -20,11 +20,20 @@
 features_train, features_test, labels_train, labels_test = preprocess()

+#########################################################
+### your code goes here ###

 #########################################################

-### your code goes here ###

 #########################################################
+'''
+You'll be provided with similar code in the quiz,
+but the code given there has an indexing issue.
+The code below fixes that issue, so use this one.
+'''
+# features_train = features_train[:int(len(features_train)/100)]
+# labels_train = labels_train[:int(len(labels_train)/100)]
+#########################################################

From 7602cdfd8d421817d32e0ff885fb444a89cdab6e Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:32:05 +0530
Subject: [PATCH 04/14] Refer ChangeLog: Lecture 06

---
 datasets_questions/explore_enron_data.py | 7 +++----
 tools/feature_format.py | 14 +++++++-------
 2 files changed, 10 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 datasets_questions/explore_enron_data.py

diff --git a/datasets_questions/explore_enron_data.py b/datasets_questions/explore_enron_data.py
old mode 100644
new mode 100755
index ca9bacb9c83..853767a5e09
--- a/datasets_questions/explore_enron_data.py
+++ b/datasets_questions/explore_enron_data.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     Starter code for exploring the Enron dataset (emails + finances);
@@ -15,8 +15,7 @@
 """

-import pickle
-
-enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
+import joblib
+enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb"))

diff --git a/tools/feature_format.py b/tools/feature_format.py
index 7ca78ac291a..a96178c73df 100644
--- a/tools/feature_format.py
+++ b/tools/feature_format.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     A general tool for converting data from the
@@ -54,8 +54,8 @@ def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True
     # Key order - first branch is for Python 3 compatibility on mini-projects,
     # second branch is for compatibility on final project.
     if isinstance(sort_keys, str):
-        import pickle
-        keys = pickle.load(open(sort_keys, "rb"))
+        import joblib
+        keys = joblib.load(open(sort_keys, "rb"))
     elif sort_keys:
         keys = sorted(dictionary.keys())
     else:
@@ -67,10 +67,10 @@
         try:
             dictionary[key][feature]
         except KeyError:
-            print "error: key ", feature, " not present"
+            print("Error: Key ", feature, " Not Present")
             return
         value = dictionary[key][feature]
-        if value=="NaN" and remove_NaN:
+        if value == 'NaN' and remove_NaN:
             value = 0
         tmp_list.append( float(value) )
@@ -86,14 +86,14 @@
         if remove_all_zeroes:
             append = False
             for item in test_list:
-                if item != 0 and item != "NaN":
+                if item != 0 and item != 'NaN':
                     append = True
                     break
         ### if any features for a given data point are zero
         ### and you want to remove data points with any zeroes,
         ### handle that here
         if remove_any_zeroes:
-            if 0 in test_list or "NaN" in test_list:
+            if 0 in test_list or 'NaN' in test_list:
                 append = False
         ### Append the data point if flagged for addition.
         if append:

From 9a14c84df0e9aa03e4171c5797861dd154d2cd4b Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:32:36 +0530
Subject: [PATCH 05/14] Refer ChangeLog: Lecture 07

---
 regression/finance_regression.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/regression/finance_regression.py b/regression/finance_regression.py
index efa10637a1f..27809819c1d 100644
--- a/regression/finance_regression.py
+++ b/regression/finance_regression.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     Starter code for the regression mini-project.
@@ -14,19 +14,20 @@

 import sys
-import pickle
+import joblib
 sys.path.append("../tools/")
 from feature_format import featureFormat, targetFeatureSplit
-dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") )
+dictionary = joblib.load( open("../final_project/final_project_dataset_modified.pkl", "rb") )
+

 ### list the features you want to look at--first item in the
 ### list will be the "target" feature
 features_list = ["bonus", "salary"]
-data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
+data = featureFormat( dictionary, features_list, remove_any_zeroes=True, sort_keys = '../tools/python2_lesson06_keys.pkl')
 target, features = targetFeatureSplit( data )

 ### training-testing split needed in regression, just like classification
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
 train_color = "b"
 test_color = "b"

From afc15672d99d2a17a084378f97103843b8601170 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:33:18 +0530
Subject: [PATCH 06/14] Refer ChangeLog: Lecture 08

---
 outliers/enron_outliers.py | 6 +++---
 outliers/outlier_removal_regression.py | 24 ++++++++++++------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/outliers/enron_outliers.py b/outliers/enron_outliers.py
index ac26d7fe9a8..d1e3d6d7319 100644
--- a/outliers/enron_outliers.py
+++ b/outliers/enron_outliers.py
@@ -1,6 +1,6 @@
-#!/usr/bin/python
+#!/usr/bin/python3

-import pickle
+import joblib
 import sys
 import matplotlib.pyplot
 sys.path.append("../tools/")
@@ -8,7 +8,7 @@

 ### read in data dictionary, convert to numpy array
-data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
+data_dict = joblib.load( open("../final_project/final_project_dataset.pkl", "rb") )
 features = ["salary", "bonus"]
 data = featureFormat(data_dict, features)

diff --git a/outliers/outlier_removal_regression.py b/outliers/outlier_removal_regression.py
index d509cd9f22f..0431f01cc26 100644
--- a/outliers/outlier_removal_regression.py
+++ b/outliers/outlier_removal_regression.py
@@ -1,16 +1,16 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 import random
 import numpy
 import matplotlib.pyplot as plt
-import pickle
+import joblib

 from outlier_cleaner import outlierCleaner

 ### load up some practice data with outliers in it
-ages = pickle.load( open("practice_outliers_ages.pkl", "r") )
-net_worths = pickle.load( open("practice_outliers_net_worths.pkl", "r") )
+ages = joblib.load( open("practice_outliers_ages.pkl", "rb") )
+net_worths = joblib.load( open("practice_outliers_net_worths.pkl", "rb") )
@@ -20,7 +20,7 @@
 ### and n_columns is the number of features
 ages = numpy.reshape( numpy.array(ages), (len(ages), 1))
 net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1))
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

 ### fill in a regression here!  Name the regression object reg so that
@@ -35,7 +35,6 @@

-
 try:
     plt.plot(ages, reg.predict(ages), color="blue")
 except NameError:
@@ -50,8 +49,8 @@
     predictions = reg.predict(ages_train)
     cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
 except NameError:
-    print "your regression object doesn't exist, or isn't name reg"
-    print "can't make predictions to use in identifying outliers"
+    print("Your regression object doesn't exist, or isn't named reg")
+    print("Can't make predictions to use in identifying outliers")
@@ -70,9 +69,9 @@
         reg.fit(ages, net_worths)
         plt.plot(ages, reg.predict(ages), color="blue")
     except NameError:
-        print "you don't seem to have regression imported/created,"
-        print " or else your regression object isn't named reg"
-        print " either way, only draw the scatter plot of the cleaned data"
+        print("You don't seem to have regression imported/created,")
+        print(" or else your regression object isn't named reg")
+        print(" either way, only draw the scatter plot of the cleaned data")
     plt.scatter(ages, net_worths)
     plt.xlabel("ages")
     plt.ylabel("net worths")
@@ -80,5 +79,6 @@
 else:
-    print "outlierCleaner() is returning an empty list, no refitting to be done"
+    print("outlierCleaner() is returning an empty list, no refitting to be done")
+

From 639f3bef29fc990edd5583ca7e88744fe38a7cf9 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:34:08 +0530
Subject: [PATCH 07/14] Refer ChangeLog: Lecture 09

---
 k_means/k_means_cluster.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/k_means/k_means_cluster.py b/k_means/k_means_cluster.py
index 6a2ba687017..b048580c1c7 100644
--- a/k_means/k_means_cluster.py
+++ b/k_means/k_means_cluster.py
@@ -1,13 +1,11 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 """
     Skeleton code for k-means clustering mini-project.
 """

-
-
-import pickle
+import joblib
 import numpy
 import matplotlib.pyplot as plt
 import sys
@@ -39,7 +37,7 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature

 ### load in the dict of dicts containing all the data on each person in the dataset
-data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") )
+data_dict = joblib.load( open("../final_project/final_project_dataset.pkl", "rb") )
 ### there's an outlier--remove it!
 data_dict.pop("TOTAL", 0)
@@ -73,4 +71,4 @@ def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature
 try:
     Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
 except NameError:
-    print "no predictions object named pred found, no clusters to plot"
+    print("No predictions object named pred found, no clusters to plot")

From 3be0c63495605c198cdd2978364f7f08ecbbabe2 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:35:02 +0530
Subject: [PATCH 08/14] Refer ChangeLog: Lecture 11

---
 text_learning/vectorize_text.py | 34 ++++++++++++++++-----------------
 tools/parse_out_email_text.py | 10 +++++-----
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/text_learning/vectorize_text.py b/text_learning/vectorize_text.py
index 629c6b0f317..f9b1da5b891 100644
--- a/text_learning/vectorize_text.py
+++ b/text_learning/vectorize_text.py
@@ -1,7 +1,7 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 import os
-import pickle
+import joblib
 import re
 import sys
@@ -43,33 +43,31 @@
         ### once everything is working, remove this line to run over full dataset
         temp_counter += 1
         if temp_counter < 200:
-            path = os.path.join('..', path[:-1])
-            print path
-            email = open(path, "r")
+            path = os.path.join('..', path[:-1])
+            print(path)
+            email = open(path, "r")

-            ### use parseOutText to extract the text from the opened email
+            ### use parseOutText to extract the text from the opened email

-            ### use str.replace() to remove any instances of the words
-            ### ["sara", "shackleton", "chris", "germani"]
-            ### append the text to word_data
+            ### use str.replace() to remove any instances of the words
+            ### ["sara", "shackleton", "chris", "germani"]

-            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
+            ### append the text to word_data

-            email.close()
-print "emails processed"
-from_sara.close()
-from_chris.close()
+            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris

-pickle.dump( word_data, open("your_word_data.pkl", "w") )
-pickle.dump( from_data, open("your_email_authors.pkl", "w") )
+            email.close()
+print("Emails Processed")
+from_sara.close()
+from_chris.close()
+joblib.dump( word_data, open("your_word_data.pkl", "wb") )
+joblib.dump( from_data, open("your_email_authors.pkl", "wb") )

 ### in Part 4, do TfIdf vectorization here

-
-

diff --git a/tools/parse_out_email_text.py b/tools/parse_out_email_text.py
index 43725b22d10..2013fa5b31a 100644
--- a/tools/parse_out_email_text.py
+++ b/tools/parse_out_email_text.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3

 from nltk.stem.snowball import SnowballStemmer
 import string
@@ -25,16 +25,16 @@ def parseOutText(f):
     words = ""
     if len(content) > 1:
         ### remove punctuation
-        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
+        text_string = content[1].translate(str.maketrans('','',string.punctuation))

         ### project part 2: comment out the line below
         words = text_string
+
+
         ### split the text string into individual words, stem each word,
         ### and append the stemmed word to words (make sure there's a single
         ### space between each stemmed word)
-
-
@@ -45,7 +45,7 @@
 def main():
     ff = open("../text_learning/test_email.txt", "r")
     text = parseOutText(ff)
-    print text
+    print(text)

From 29c8a743fbc7b62f732d357b572161e11027a0d2 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:35:33 +0530
Subject: [PATCH 09/14] Refer ChangeLog: Lecture 12

---
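Note: joblib, like pickle, reads binary data, so under Python 3 the .pkl files
below must be opened with "rb" (a text-mode handle typically fails with a
UnicodeDecodeError). A minimal sketch of the load pattern this patch relies on,
assuming the standard ud120 file layout:

    import joblib

    # Binary mode ("rb") is required when reading pickled data in Python 3.
    with open("../text_learning/your_word_data.pkl", "rb") as f:
        word_data = joblib.load(f)
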
 feature_selection/find_signature.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/feature_selection/find_signature.py b/feature_selection/find_signature.py
index c01a1f2111a..e60c06fd3d8 100644
--- a/feature_selection/find_signature.py
+++ b/feature_selection/find_signature.py
@@ -1,6 +1,6 @@
-#!/usr/bin/python
+#!/usr/bin/python3

-import pickle
+import joblib
 import numpy
 numpy.random.seed(42)
@@ -10,8 +10,8 @@
 ### mini-project.
 words_file = "../text_learning/your_word_data.pkl"
 authors_file = "../text_learning/your_email_authors.pkl"
-word_data = pickle.load( open(words_file, "r"))
-authors = pickle.load( open(authors_file, "r") )
+word_data = joblib.load( open(words_file, "rb"))
+authors = joblib.load( open(authors_file, "rb") )
@@ -19,12 +19,11 @@
 ### remainder go into training)
 ### feature matrices changed to dense representations for compatibility with
 ### classifier functions in versions 0.15.2 and earlier
-from sklearn import cross_validation
-features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
+from sklearn.model_selection import train_test_split
+features_train, features_test, labels_train, labels_test = train_test_split(word_data, authors, test_size=0.1, random_state=42)

 from sklearn.feature_extraction.text import TfidfVectorizer
-vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
-                             stop_words='english')
+vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
 features_train = vectorizer.fit_transform(features_train)
 features_test = vectorizer.transform(features_test).toarray()

From 624636afed2f63e1f4d5274eb1f802bc50a54a59 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:36:04 +0530
Subject: [PATCH 10/14] Refer ChangeLog: Lecture 13

---
 pca/eigenfaces.py | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/pca/eigenfaces.py b/pca/eigenfaces.py
index 074b860a253..9e12f91208b 100644
--- a/pca/eigenfaces.py
+++ b/pca/eigenfaces.py
@@ -23,7 +23,7 @@
 import pylab as pl
 import numpy as np

-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_lfw_people
 from sklearn.grid_search import GridSearchCV
 from sklearn.metrics import classification_report
@@ -53,11 +53,10 @@
 target_names = lfw_people.target_names
 n_classes = target_names.shape[0]

-print "Total dataset size:"
-print "n_samples: %d" % n_samples
-print "n_features: %d" % n_features
-print "n_classes: %d" % n_classes
-
+print("Total dataset size:")
+print("n_samples: %d" % n_samples)
+print("n_features: %d" % n_features)
+print("n_classes: %d" % n_classes)

 ###############################################################################
 # Split into a training and testing set
@@ -68,24 +67,23 @@
 # dataset): unsupervised feature extraction / dimensionality reduction
 n_components = 150

-print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])
+print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
 t0 = time()
 pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
-print "done in %0.3fs" % (time() - t0)
+print("done in %0.3fs" % (time() - t0))

 eigenfaces = pca.components_.reshape((n_components, h, w))

-print "Projecting the input data on the eigenfaces orthonormal basis"
+print("Projecting the input data on the eigenfaces orthonormal basis")
 t0 = time()
 X_train_pca = pca.transform(X_train)
 X_test_pca = pca.transform(X_test)
-print "done in %0.3fs" % (time() - t0)
+print("done in %0.3fs" % (time() - t0))

 ###############################################################################
-# Train a SVM classification model
-
-print "Fitting the classifier to the training set"
+# Train a SVM classification model
+print("Fitting the classifier to the training set")
 t0 = time()
 param_grid = {
          'C': [1e3, 5e3, 1e4, 5e4, 1e5],
@@ -94,21 +92,21 @@
 # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
 clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
 clf = clf.fit(X_train_pca, y_train)
-print "done in %0.3fs" % (time() - t0)
-print "Best estimator found by grid search:"
-print clf.best_estimator_
+print("done in %0.3fs" % (time() - t0))
+print("Best estimator found by grid search:")
+print(clf.best_estimator_)

 ###############################################################################
 # Quantitative evaluation of the model quality on the test set

-print "Predicting the people names on the testing set"
+print("Predicting the people names on the testing set")
 t0 = time()
 y_pred = clf.predict(X_test_pca)
-print "done in %0.3fs" % (time() - t0)
+print("done in %0.3fs" % (time() - t0))

-print classification_report(y_test, y_pred, target_names=target_names)
-print confusion_matrix(y_test, y_pred, labels=range(n_classes))
+print(classification_report(y_test, y_pred, target_names=target_names))
+print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))

 ###############################################################################

From 4627d16f766f262b891dbabcc8e517155055b442 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:37:53 +0530
Subject: [PATCH 11/14] Initial Commit: .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 17c225de1fc..be94d4f00b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,5 @@ text_learning/your_email_authors.pkl
 my_classifier.pkl
 my_dataset.pkl
 my_feature_list.pkl
+.DS_Store
+__pycache__
\ No newline at end of file

From 3d7e64cbfc61888621b3d48688d3632a125e0a04 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:38:04 +0530
Subject: [PATCH 12/14] Initial Commit: ReadMe

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 9c2c9967cb5..94a752739fe 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,7 @@ ud120-projects
 ==============

 Starter project code for students taking Udacity ud120
+
+### Migrated to Python-3 by [Siddharth Kekre](https://github.com/iSiddharth20)
+
+### Please refer to the [Change Log](https://github.com/iSiddharth20/ud120-projects/blob/master/CHANGELOG.md) for details.
\ No newline at end of file

From 104d2baf4c3f89640fc06f6c95ed3c960542256b Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:38:23 +0530
Subject: [PATCH 13/14] Initial Commit: Requirements.TXT

---
 requirements.txt | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1d4ac04c20e..363c377950b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,7 @@
-nltk==3.2.1
-numpy==1.11.2
-scikit-learn==0.18
-scipy==0.18.1
+nltk>=3.5
+numpy>=1.18.2
+scikit-learn>=0.22.2.post1
+scipy>=1.4.1
+joblib>=0.14.1
+requests>=2.23.0
+matplotlib>=3.2.1
\ No newline at end of file

From 5bcfc70b6354d0278705369f904191d37538d226 Mon Sep 17 00:00:00 2001
From: Siddharth Kekre
Date: Wed, 23 Sep 2020 22:38:54 +0530
Subject: [PATCH 14/14] Initial Commit: ChangeLog by iSiddharth

---
 CHANGELOG.md | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000000..0bf387407bd
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,111 @@
+# Change Log
+ * All notable changes to this project will be documented in this file.
+ * Lecture-wise, file-specific changes are also mentioned.
+
+
+## Common Changes to All Files
+- Migrated all code to Python-3.
+- Updated shebang to '#!/usr/bin/python3'.
+- Updated libraries to support Python 3.6 or higher.
+- Added Python-3 versions of course quiz code (in some cases).
+- Updated 'README.md'.
+- Updated 'requirements.txt'.
+- Updated '.gitignore'.
+- Added 'CHANGELOG.md'.
+
+## Lecture 1 : Intro to Machine Learning
+### tools/startup.py
+- Updated the printed download size from '423 MB' to '1.82 GB', as the dataset has been updated.
+- Updated code for downloading 'enron_mail_20150507.tar.gz' using 'requests' instead of 'urllib'.
+- Updated code for extracting 'enron_mail_20150507.tar.gz' for Python-3.
+
+### tools/email_preprocess.py
+- 'joblib' used instead of 'pickle' and 'cPickle'.
+- 'model_selection.train_test_split' used instead of 'cross_validation.train_test_split'.
+
+
+## Lecture 2 : Naive Bayes
+### naive_bayes/nb_author_id.py
+- Added Python-3 code to print training and predicting times for the course quiz.
+
+
+## Lecture 3 : SVM
+### svm/svm_author_id.py
+- Added a fix for the indexing issue when slicing 1% of the training data.
+
+
+## Lecture 4 and Lecture 5
+- No Special Changes
+
+
+## Lecture 6 : Dataset and Questions
+### datasets_questions/explore_enron_data.py
+- 'joblib' used instead of 'pickle'.
+
+### tools/feature_format.py
+- 'joblib' used instead of 'pickle'.
+
+
+## Lecture 7 : Regression
+### regression/finance_regression.py
+- 'joblib' used instead of 'pickle'.
+- Updated reading mode of '.pkl' file from 'r' to 'rb' to resolve file reading issue.
+- 'model_selection.train_test_split' used instead of 'cross_validation.train_test_split'.
+- Added the 'sort_keys' parameter on line 26, as mentioned in the course, for Python-3 compatibility.
+
+
+## Lecture 8 : Outliers
+### outliers/enron_outliers.py
+- 'joblib' used instead of 'pickle'.
+- Updated reading mode of '.pkl' file from 'r' to 'rb' to resolve file reading issue.
+
+### outliers/outlier_removal_regression.py
+- 'joblib' used instead of 'pickle'.
+- Updated reading mode of '.pkl' file from 'r' to 'rb' to resolve file reading issue.
+- 'model_selection.train_test_split' used instead of 'cross_validation.train_test_split'.
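+
+A condensed sketch of the load-and-split pattern these outlier scripts now follow (paths as in the repo; the numpy reshape steps are omitted here):
+
+```python
+import joblib
+from sklearn.model_selection import train_test_split  # was sklearn.cross_validation
+
+# Binary mode ('rb') is required for pickled data under Python 3.
+ages = joblib.load(open("practice_outliers_ages.pkl", "rb"))
+net_worths = joblib.load(open("practice_outliers_net_worths.pkl", "rb"))
+ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
+    ages, net_worths, test_size=0.1, random_state=42)
+```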
+
+
+## Lecture 9 : Clustering
+### k_means/k_means_cluster.py
+- 'joblib' used instead of 'pickle'.
+- Updated reading mode of '.pkl' file from 'r' to 'rb' to resolve file reading issue.
+
+
+## Lecture 10
+- No Special Changes
+
+
+## Lecture 11 : Text Learning
+### tools/parse_out_email_text.py
+- Updated punctuation removal from 'string.maketrans' to 'str.maketrans' for Python-3 compatibility.
+
+### text_learning/vectorize_text.py
+- 'joblib' used instead of 'pickle'.
+
+
+## Lecture 12 : Feature Selection
+### feature_selection/find_signature.py
+- 'joblib' used instead of 'pickle'.
+- Updated reading mode of '.pkl' files from 'r' to 'rb' to resolve file reading issue.
+- 'model_selection.train_test_split' used instead of 'cross_validation.train_test_split'.
+
+
+## Lecture 13 : PCA
+### pca/eigenfaces.py
+- 'model_selection.train_test_split' used instead of 'cross_validation.train_test_split'.
+
+
+## Lecture 14 : Validation
+- No Special Changes
+
+
+## Lecture 15 : Evaluation Metrics
+- No Special Changes
+
+
+## Lecture 16 : Tying It All Together
+- No Special Changes
+
+
+## Lecture 17 : Final Project
+- No Special Changes
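+
+
+## Appendix : Example of the 'str.maketrans' Migration
+A condensed illustration of the Lecture 11 change in 'tools/parse_out_email_text.py' (not the full function; the sample string is hypothetical):
+
+```python
+import string
+
+# Python-2 form (removed):
+#   text_string = content[1].translate(string.maketrans("", ""), string.punctuation)
+# Python-3 form (added): str.maketrans maps each punctuation character to None.
+text = "some, text; with. punctuation!"
+print(text.translate(str.maketrans('', '', string.punctuation)))
+# -> 'some text with punctuation'
+```
\ No newline at end of file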