commit e0c76dbcfb924d9c5552326cf6f0029af191278b Author: Valentin Ochs Date: Sun Feb 2 20:23:00 2020 +0100 Initial commit diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..0601ae2 --- /dev/null +++ b/Makefile @@ -0,0 +1,29 @@ +CXXFLAGS += -std=c++17 -pedantic -Wall +LDFLAGS=-lm -lstdc++ + +SRC=test.cpp +OBJ=$(SRC:%=%.o) +JSON=$(OBJ:%=%.json) +HDR=markov.hpp +BIN=test + +all: $(BIN) compile_commands.json + +$(BIN): $(OBJ) + $(CXX) $(CXXFLAGS) $(LDFLAGS) $(OBJ) -o $@ + +$(OBJ): %.o: % $(HDR) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +$(JSON): %.o.json: % Makefile + clang $(CXXFLAGS) -c $< -o /dev/null -MJ $@ + +compile_commands.json: $(JSON) + sed -e '1s/^/[\n/' -e 's/,$$/\n]/' $^ > $@ + + + +clean: + rm -f $(BIN) $(OBJ) $(JSON) + +.PHONY: clean diff --git a/markov.hpp b/markov.hpp new file mode 100644 index 0000000..303baa5 --- /dev/null +++ b/markov.hpp @@ -0,0 +1,129 @@ +#pragma once +#include +#include +#include +#include + +/* Marcov class. + T: Element type + ORDER: Prefix length. A markov chain of order n uses n elements to give a +probability for the next element +*/ +template class Markov { +public: + template friend class Markov; + + using Child = Markov; + using Children = std::map; + + /* Total number of entries for this node + If not a leaf, also the sum of the totals of its children. + */ + size_t total = 0; + /* A map of a key to child nodes */ + Children children; + + Markov() = default; + + constexpr size_t order() const { return ORDER; } + + /* Add the start of iter to the model + Must have sufficient size (at least order + 1) + */ + template void add(IT iter, size_t size) { + if (size <= ORDER) { + return; + } + total++; + find(*iter)->second.add(iter + 1, size - 1); + } + + /* Decrements the occurences of the start of iter by 1. + Must have sufficient size (at least order + 1) + Returns true if the sequence could be found in the model + */ + template bool dec(IT iter, size_t size) { + if (size < ORDER) { + return false; + } + + total--; + iter++; + + auto child = children.find(*iter); + if (child != children.end()) { + if (child->dec(iter, size - 1)) { + if (child->total == 0) { + children.erase(child); + } + return true; + } + } + return false; + } + +protected: + /* Finds the key or adds it */ + typename Children::iterator find(const T &key) { + return children.try_emplace(key).first; + } + +public: + /* Probability of finding the first order() items (but at most size) in the + sequence */ + template double probability(IT iter, size_t size) const { + if (size == 0) { + return 1.; + } + + auto child = children.find(*iter); + if (child != children.cend()) { + auto &val = child->second; + return (double(val.total) / double(total)) * + val.probability(iter + 1, size - 1); + } else { + return 0.5 / double(total); + } + } + + /* Probability of finding the last item (but at most the order+1st) in the + model, given the previous sequence */ + template double final_probability(IT iter, size_t size) const { + if (size == 0) { + return 1.; + } + auto child = children.find(*iter); + if (child != children.cend()) { + auto &val = child->second; + if (size == 1) { + return double(val.total) / double(total); + } else { + return val.final_probability(iter + 1, size - 1); + } + } else { + return 0.5 / double(total); + } + } +}; + +/* Leaf of the recursion with just a count */ +template class Markov::max()> { +public: + size_t total = 0; + Markov() = default; + + constexpr size_t order() const { return 0; } + + template void add(IT, size_t) { total++; } + + template bool dec(IT, size_t) { + total--; + return true; + } + + template double probability(IT, size_t) const { return 1.; } + + template double final_probability(IT, size_t) const { + return 1.; + } +}; diff --git a/test.cpp b/test.cpp new file mode 100644 index 0000000..9c3cc0f --- /dev/null +++ b/test.cpp @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "markov.hpp" + +using Corpus = Markov; + +/* Manage a lookup table for monoalphabetic substitution */ +struct Mapping { + char map[26]{}; + Mapping() = default; + + char &operator[](size_t i) { return map[i]; } + + constexpr auto begin() { return &map[0]; } + + constexpr auto cbegin() const { return &map[0]; } + + constexpr auto cend() const { return &map[26]; } + + /* Create the LUT. Maps all characters except a-z to themselves, uses map for + the rest */ + constexpr void create_map(char *output) const { + for (size_t i = 0; i < 256; i++) { + output[i] = i; + } + for (size_t i = 0; i < 26; i++) { + output['a' + i] = map[i]; + output['A' + i] = map[i]; + } + } + + constexpr auto end() { return &map[26]; } + + /* Swap two output characters */ + bool swap(char a, char b) { + auto left = std::find(begin(), end(), a); + auto right = std::find(begin(), end(), b); + if (left != end() && right != end()) { + std::swap(left, right); + return true; + } + return false; + } +}; +class Decrypt { + /* Markov model of the plaintext */ + Corpus corpus; + + /* Cipher text */ + char const *cipher_text; + + /* Temporary buffer for decryption attempts */ + char *plain_text; + + /* Length of the cipher text */ + size_t len; + +public: + Decrypt(char const *corpus, char const *data) + : corpus(create_corpus(corpus)), cipher_text(data), + plain_text(static_cast(malloc(strlen(data) + 1))), + len(strlen(cipher_text)) { + plain_text[strlen(data)] = 0; + } + + ~Decrypt() { free(plain_text); } + + Mapping find_map(size_t tries, size_t iterations) { + Mapping map; + double best_score = INFINITY; + + while (tries--) { + Mapping m; + double s; + std::tie(m, s) = random_map(iterations); + + if (s < best_score) { + map = m; + best_score = s; + printf("New best try, character score is %g:\n", + best_score / double(len)); + printf(" "); + for (uint32_t i = 0; i < 26; i++) { + putchar(toupper(map[i])); + } + printf("\n"); + printf(" %s\n", apply_map(m)); + } + } + + return map; + } + + char const *apply_map(Mapping const &m) { + char map[256]; + m.create_map(&map[0]); + std::transform(&cipher_text[0], &cipher_text[len], &plain_text[0], + [map](unsigned char a) { return map[a]; }); + + return plain_text; + } + +protected: + Corpus create_corpus(char const *file) { + Corpus m; + std::ifstream corpus{file}; + + char buf[128]; + for (auto i = 0u; i < m.order(); i++) { + buf[i] = ' '; + } + + while (corpus.good()) { + corpus.read(&buf[m.order()], sizeof(buf) - m.order()); + auto buflen = m.order() + corpus.gcount(); + + for (size_t i = 0; i < (buflen - m.order() - 1u); i++) { + m.add(&buf[i], m.order() + 1); + } + } + + return m; + } + + double rate(Mapping const &m, int max_order = -1) const { + const auto order = + (-1 == max_order) ? corpus.order() : static_cast(max_order); + double score = 0.; + char map[256]; + m.create_map(&map[0]); + std::transform(&cipher_text[0], &cipher_text[len], &plain_text[0], + [map](unsigned char a) { return map[a]; }); + for (size_t i = 0, valid = 0; i < len; i++) { + if (plain_text[i]) { + valid++; + if (valid == (order + 1)) { + score += log(corpus.probability(&plain_text[i - valid], valid)); + } else if (valid > (order + 1)) { + score += log( + corpus.final_probability(&plain_text[i - order - 1], order + 1)); + } + } else { + if (valid && (valid <= order)) { + score += log(corpus.probability(&plain_text[i - valid], valid)); + } + valid = 0; + } + } + + return -score; + } + + std::pair random_map(size_t iterations) { + Mapping m; + double current_score{INFINITY}; + for (uint32_t i = 0; i < 26; i++) { + m[i] = 'a' + i; + } + + std::random_device rd; + std::mt19937 g(rd()); + do { + std::shuffle(&m.map[0], &m.map[26], g); + current_score = rate(m); + } while (std::isinf(current_score)); + + std::uniform_int_distribution char_index{0, 25}; + std::uniform_real_distribution random_float; + + int order = 0; + for (uint32_t i = 0; i < iterations; i++) { + Mapping m2 = m; + std::swap(m2[char_index(g)], m2[char_index(g)]); + + int new_order = (corpus.order() * (1 + i) + iterations - 1) / iterations; + if (new_order != order) { + current_score = rate(m); + order = new_order; + } + + auto new_score = rate(m2, order); + double diff = current_score - new_score; + if ((diff > 0) || (random_float(g) < + std::exp(diff * double(1 + i) / double(iterations)))) { + current_score = new_score; + m = m2; + } + } + + return std::pair(m, rate(m)); + } +}; + +int main(int argc, char const **argv) { + if (argc < 3) { + printf("Usage: %s corpus ciphertext\n", argv[0]); + exit(1); + } + printf("Cipher text: %s\n", argv[2]); + Decrypt d(argv[1], argv[2]); + Mapping map = d.find_map(50, 1000); + printf("%s\n", d.apply_map(map)); + + return 0; +}