Problem: http://www.cryptopals.com/sets/1/challenges/4
I've previously completed this problem in C but I wanted to do a more generalised solution in Go (I just stopped checking strings for englishness when I reached one that matched my arbitrary goal in C, now I want the MOST english of all).
My solution works fine for the challenge 3 string, but when I try with challenge 4 I just get garbage out. More disconcertingly, I can't even see the correct string in the collection of strings generated by my XORs (I printed them all to a file). I've tried changing how I extract the strings from the text file (which is why I'm using a less standard method of getting the strings out) with no effect.
Challenge 3's string is "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736" which should output "Cooking MC's like a pound of bacon" which functions correctly. Challenge 4 is a file with 300 odd lines, only one of which is the correct one to decode. It should decrypt to "Now that the party is jumping", but I just get "U+)Ex(unprintable)NSqhe/]PuSE7Nr;Rw;OUqeas". I've been able to get a couple of different outputs but never the correct one.
func main() {
filebytes, err := ioutil.ReadFile("4.txt")
if err != nil {
log.Fatal(err)
}
filestring := string(filebytes)
lines := strings.Split(filestring, "
")
bestGuess := challenge4.GuessFile(lines)
fmt.Println(bestGuess)
}
func GuessFile(lines []string) string {
guessArray := make([]string, len(lines))
for i, line := range lines {
bytes, err := hex.DecodeString(line)
if err != nil {
log.Fatal(err)
}
guessArray[i] = challenge3.GuessString(bytes)
}
return utilities.MostEnglish(guessArray)
}
func GuessString(b []byte) string {
guessArray := make([]string, 256)
for i := 0; i < 256; i++ {
guessArray[i] = string(utilities.SbXor(b, byte(i)))
}
return utilities.MostEnglish(guessArray)
}
// MostEnglish takes a slice of strings and returns the string most likely to
// be an English sentence.
func MostEnglish(s []string) string {
var maxVal uint64
var maxStr string
for _, line := range s {
val := EnglishFreq(line)
if val != 0 {
if val > maxVal {
maxVal, maxStr = val, line
}
}
}
return maxStr
}
// EnglishFreq takes a string and returns the value representing
// the likelihood the string is a valid English sentence based on word frequency
func EnglishFreq(s string) uint64 {
var total uint64
alphaFreq := map[rune]uint64{
'A': 816,
'B': 149,
'C': 278,
'D': 425,
'E': 1270,
'F': 222,
'G': 201,
'H': 609,
'I': 696,
'J': 15,
'K': 77,
'L': 402,
'M': 240,
'N': 674,
'O': 750,
'P': 192,
'Q': 9,
'R': 598,
'S': 632,
'T': 905,
'U': 275,
'V': 236,
'W': 20,
'X': 15,
'Y': 197,
'Z': 7,
}
for _, char := range s {
if !unicode.IsPrint(char) {
return 0
}
if val, ok := alphaFreq[unicode.ToUpper(char)]; ok {
total += val
}
}
return total
}
//SbXor does a single byte xor against a provided byte array
func SbXor(arr []byte, b byte) []byte {
for i := range arr {
arr[i] ^= b
}
return arr
}
Your score function EnglishFreq
is flawed. You ignore plaintext candidates that contain at least one non-printable character. However, there are some valid plaintexts that do contain one or more of those. For instance:
package main
import (
"testing"
"unicode"
)
func EnglishFreq(s string) uint64 {
var total uint64
alphaFreq := map[rune]uint64{
'A': 816, 'B': 149, 'C': 278, 'D': 425, 'E': 1270, 'F': 222, 'G': 201,
'H': 609, 'I': 696, 'J': 15, 'K': 77, 'L': 402, 'M': 240, 'N': 674,
'O': 750, 'P': 192, 'Q': 9, 'R': 598, 'S': 632, 'T': 905, 'U': 275,
'V': 236, 'W': 20, 'X': 15, 'Y': 197, 'Z': 7,
}
for _, char := range s {
if !unicode.IsPrint(char) {
return 0
}
if val, ok := alphaFreq[unicode.ToUpper(char)]; ok {
total += val
}
}
return total
}
func TestEnglishFreq(t *testing.T) {
if s := "foo
bar"; EnglishFreq(s) == 0 {
t.Errorf("EnglishFreq(%q) == 0, want > 0", s)
}
}
// $ go test
// --- FAIL: TestEnglishFreq (0.00s)
// main_test.go:31: EnglishFreq("foo
bar") == 0, want > 0
Don't discard potential plaintexts just because they contain a single unprintable character. Instead, reduce the score for each such character. This will be important in later challenges too.
I've had success with chi-squared testing, suggested in an answer on Cryptography Stack Exchange.