# Relative frequencies of the letters a-z in English text (fractions that sum
# to roughly 1). Used whenever the caller does not supply a frequency table.
_ENGLISH_LETTER_FREQUENCIES = {
    "a": 0.08497,
    "b": 0.01492,
    "c": 0.02202,
    "d": 0.04253,
    "e": 0.11162,
    "f": 0.02228,
    "g": 0.02015,
    "h": 0.06094,
    "i": 0.07546,
    "j": 0.00153,
    "k": 0.01292,
    "l": 0.04025,
    "m": 0.02406,
    "n": 0.06749,
    "o": 0.07507,
    "p": 0.01929,
    "q": 0.00095,
    "r": 0.07587,
    "s": 0.06327,
    "t": 0.09356,
    "u": 0.02758,
    "v": 0.00978,
    "w": 0.02560,
    "x": 0.00150,
    "y": 0.01994,
    "z": 0.00077,
}


def _chi_squared_term(letter, occurrences, frequencies, case_sensetive):
    """Return one character's chi-squared term, or None if it is not scored."""
    if case_sensetive:
        if letter not in frequencies:
            return None
        frequency = frequencies[letter]
    else:
        folded = letter.lower()
        if folded not in frequencies:
            return None
        # Prefer an entry for the exact character and fall back to the
        # lowercase entry; indexing frequencies[letter] directly raised
        # KeyError for uppercase symbols coming from a custom alphabet.
        frequency = frequencies.get(letter, frequencies[folded])

    expected = frequency * occurrences
    return ((occurrences - expected) ** 2) / expected


def decrypt_caesar_with_chi_squared(
    ciphertext: str,
    cipher_alphabet=None,
    frequencies_dict=None,
    case_sensetive: bool = False,
) -> tuple:
    """
    Break a caesar cipher by scoring every shift with a chi-squared statistic.

    Basic Usage
    ===========
    Arguments:
    * ciphertext (str): the text to decode (encoded with the caesar cipher)

    Optional Arguments:
    * cipher_alphabet (list): the alphabet used for the cipher, one symbol per
      element. Defaults to the lowercase letters a-z when omitted or empty.
    * frequencies_dict (dict): maps each letter to its relative frequency as a
      decimal/float. Defaults to an English letter-frequency table when
      omitted or empty.
    * case_sensetive (bool): True if the case matters during decryption,
      False if it doesn't. When False the ciphertext is lowercased first.
      (The spelling of this parameter name is kept for compatibility with
      existing callers.)

    Returns:
    * A tuple in the form of:
        (
          most_likely_cipher,
          most_likely_cipher_chi_squared_value,
          decoded_most_likely_cipher
        )

      where...
      - most_likely_cipher is an integer representing the shift with the
        smallest chi-squared statistic (most likely key)
      - most_likely_cipher_chi_squared_value is the chi-squared statistic of
        that shift (a float, or the integer 0 when no character was scored)
      - decoded_most_likely_cipher is a string with the decoded cipher
        (decoded by the most_likely_cipher key)

    Raises:
    * AttributeError if ciphertext has no ``lower`` method and case_sensetive
      is False.
    * ZeroDivisionError if a scored letter has a frequency of 0.

    The Chi-squared test
    ====================

    The caesar cipher
    -----------------
    The caesar cipher is a very insecure encryption algorithm, however it has
    been used since Julius Caesar. The cipher is a simple substitution cipher
    where each character in the plain text is replaced by a character in the
    alphabet a certain number of characters after the original character. The
    number of characters away is called the shift or key. For example:

    Plain text: hello
    Key: 1
    Cipher text: ifmmp
    (each letter in hello has been shifted one to the right in the English
    alphabet)

    This doesn't provide much security: decrypting ciphertext by brute force
    is easy even by hand. The chi-squared test automates picking the right
    candidate out of the brute-forced ones.

    How the statistic is computed here
    ----------------------------------
    1. The ciphertext is decoded with every possible shift (one per symbol of
       the alphabet). Characters that are not in the alphabet are copied
       through unchanged.
    2. For every character of a candidate plaintext that has an entry in the
       frequency table, let

       - n be the number of times that character appears in the candidate
       - p be the predicted count, computed as frequency * n

                (n - p)^2
       and      --------- = v
                    p

    3. v is added to the candidate's total once per occurrence of the
       character, in message order.
    4. The shift with the lowest total is reported as the most likely key.
       Ties are broken by the candidate plaintext, then by the lowest shift.

    NOTE(review): the textbook chi-squared test predicts p as
    frequency * total number of characters, not frequency * n. The computation
    above is kept as it is because the documented results below, and any
    caller comparing statistics, depend on it.

    Further Reading
    ================

    * http://practicalcryptography.com/cryptanalysis/text-characterisation/chi-squared-statistic/
    * https://en.wikipedia.org/wiki/Letter_frequency
    * https://en.wikipedia.org/wiki/Chi-squared_test
    * https://en.m.wikipedia.org/wiki/Caesar_cipher

    Doctests
    ========
    >>> decrypt_caesar_with_chi_squared('dof pz aol jhlzhy jpwoly zv wvwbshy? pa pz avv lhzf av jyhjr!')
    (7, 3129.228005747531, 'why is the caesar cipher so popular? it is too easy to crack!')

    >>> decrypt_caesar_with_chi_squared('crybd cdbsxq')
    (10, 233.35343938980898, 'short string')

    >>> decrypt_caesar_with_chi_squared(12)
    Traceback (most recent call last):
    AttributeError: 'int' object has no attribute 'lower'
    """
    # Fall back to the English defaults when an argument is omitted or empty.
    alphabet_letters = cipher_alphabet or [chr(i) for i in range(97, 123)]
    frequencies = frequencies_dict or _ENGLISH_LETTER_FREQUENCIES

    if not case_sensetive:
        ciphertext = ciphertext.lower()

    alphabet_size = len(alphabet_letters)

    # Position of each symbol in the alphabet. setdefault keeps the first
    # position of a repeated symbol, which is what list.index would return.
    first_index = {}
    for index, symbol in enumerate(alphabet_letters):
        first_index.setdefault(symbol, index)

    # shift -> (chi-squared statistic, plaintext decoded with that shift)
    chi_squared_statistic_values = {}

    for shift in range(alphabet_size):
        # Decode with one dictionary lookup per character.
        substitution = {
            symbol: alphabet_letters[(index - shift) % alphabet_size]
            for symbol, index in first_index.items()
        }
        decrypted_with_shift = "".join(
            substitution.get(letter, letter) for letter in ciphertext
        )

        # A character's term depends only on the character and its count, so
        # compute it once per distinct character. The total is accumulated
        # with an explicit loop in message order (not sum()) so the floating
        # point result stays identical to the documented values.
        term_by_letter = {}
        chi_squared_statistic = 0
        for letter in decrypted_with_shift:
            if letter not in term_by_letter:
                term_by_letter[letter] = _chi_squared_term(
                    letter,
                    decrypted_with_shift.count(letter),
                    frequencies,
                    case_sensetive,
                )
            term = term_by_letter[letter]
            if term is not None:
                chi_squared_statistic += term

        chi_squared_statistic_values[shift] = (
            chi_squared_statistic,
            decrypted_with_shift,
        )

    # The most likely key is the shift with the smallest statistic.
    most_likely_cipher = min(
        chi_squared_statistic_values, key=chi_squared_statistic_values.get
    )
    (
        most_likely_cipher_chi_squared_value,
        decoded_most_likely_cipher,
    ) = chi_squared_statistic_values[most_likely_cipher]

    return (
        most_likely_cipher,
        most_likely_cipher_chi_squared_value,
        decoded_most_likely_cipher,
    )