Added the Markov Chain kata.
This commit is contained in:
parent
6f8c1b55eb
commit
e10dd57471
5 changed files with 1817 additions and 0 deletions
45
markov/README.md
Normal file
45
markov/README.md
Normal file
|
@ -0,0 +1,45 @@
|
|||
# Fun with Markov Chains
|
||||
|
||||
The subject: https://codingdojo.org/kata/MarkovChain/
|
||||
|
||||
Because this is as fun as Large Language Models, but does not need large sets of stolen texts and centillions of GPUs to run.
|
||||
|
||||
## Building the chains
|
||||
|
||||
The `filter.py` program must be run first to extract, for each word, the probabilities of the words that follow it. It behaves like a Unix filter, reading the text to analyse on its standard input and printing the statistics on its standard output.
|
||||
|
||||
Texts that are used for training should be quite clean, and it can be interesting to use `filter.py` with other filters to get the most effective statistics. As an example, the `gen.sh` script proceeds with the following steps.
|
||||
|
||||
- Removing all parentheses, carriage returns and line feeds.
|
||||
- Converting all characters to lower case.
|
||||
- Adding a space before commas and periods, so that these punctuation symbols are treated as words.
|
||||
- Running the filter and saving the result to a file (the input file name with a `.stats` suffix).
|
||||
|
||||
## Generating surrealistic sequences of words
|
||||
|
||||
The `markov.py` program understands the following arguments.
|
||||
|
||||
- `-f` to provide the path to a statistics file. This is mandatory, of course.
|
||||
- `-w` to provide a first word. If not provided, the first word will be chosen randomly.
|
||||
- `-n`, the number of words to display (including the given one, if any).
|
||||
|
||||
## Notes for testing
|
||||
|
||||
The Python programs rely on the `args` kata, available in a sibling directory. So `PYTHONPATH=../args` should be used to run the programs in place.
|
||||
|
||||
A `poe.txt` file is provided as a sample. It contains excerpts from Tales of the Grotesque and Arabesque by Edgar Allan Poe. They are of course in the public domain.
|
||||
|
||||
> ./gen.sh poe.txt
|
||||
> PYTHONPATH=../args ./markov.py -f poe.txt.stats -w le
|
||||
le pavé de moi , puisqu’elle était un peu près semblable perfection dans les cas qui semblaient n’éprouver aucune donnée pour de leur cœur ⏎
|
||||
|
||||
# Interesting future paths
|
||||
|
||||
The generated statistics are a large dictionary of word/probability pairs, indexed by the previous word.
|
||||
|
||||
By default, words are thus written once as a key, and as many times as they are found elsewhere in the text, with a probability of occurrence. It should be more efficient to associate short identifiers (integers) with the words, and to use them anywhere the word is to be found.
|
||||
|
||||
This is what is done by the `filter.py` program when it is given the `-t` argument. However:
|
||||
|
||||
- the `markov.py` program does not understand this format yet.
|
||||
- the result is not very convincing yet, because the statistics are stored as JSON (an easy option to begin with) and this format does not store integers in a concise way.
|
101
markov/filter.py
Executable file
101
markov/filter.py
Executable file
|
@ -0,0 +1,101 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2023 David Soulayrol.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the “Software”), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
import args
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
class Store:
    """Accumulate word-succession counts and export them as probabilities.

    Each known word is associated with the number of successors recorded
    for it and a count per distinct successor. The export methods turn
    these counts into ratios (count / total), keyed either by the words
    themselves or by integer tags.
    """

    def __init__(self):
        # word -> {'n': total successors recorded, 'sums': {successor: count}}
        self.__store = {}
        # word -> integer tag, assigned lazily by get_tag().
        self.__tags = {}
        # Last tag handed out; the first tag returned is therefore 1001.
        self.__tag_generator = 1000

    def add_word(self, ante, word):
        """Register `word`, and count it as a successor of `ante`.

        `word` always gets an entry in the store, even when it has no
        successor yet. When `ante` is empty, `word` is only registered
        (it is the first word of a sequence).
        """
        self.__store.setdefault(word, {'n': 0, 'sums': {}})

        if not ante:
            return

        # Create the predecessor entry on demand, so that a caller that
        # never registered `ante` itself does not trigger a KeyError.
        props = self.__store.setdefault(ante, {'n': 0, 'sums': {}})
        props['n'] += 1
        props['sums'][word] = props['sums'].get(word, 0) + 1

    def get_tag(self, word):
        """Return the integer tag of `word`, allocating one if needed."""
        tag = self.__tags.get(word)

        if tag is None:
            self.__tag_generator += 1
            tag = self.__tags[word] = self.__tag_generator

        return tag

    @staticmethod
    def _compute_statistics(n, sums, key=lambda w: w):
        """Turn successor counts into ratios of `n`.

        `key` maps each successor word to the key used in the result.
        `n` is only used as a divisor when `sums` is not empty.
        """
        return {key(w): count / n for w, count in sums.items()}

    def export(self):
        """Return {word: {successor: probability}} for every known word."""
        return {
            word: self._compute_statistics(props['n'], props['sums'])
            for word, props in self.__store.items()
        }

    def export_with_tags(self):
        """Return the statistics keyed by integer tags instead of words.

        Each entry has the form {tag: {'w': word, 's': {tag: probability}}}.
        """
        exported = {}

        for word, props in self.__store.items():
            # Successors are tagged before the word itself; this keeps the
            # tag numbering stable with respect to the traversal order.
            stats = self._compute_statistics(
                props['n'], props['sums'], key=self.get_tag)
            exported[self.get_tag(word)] = {'w': word, 's': stats}

        return exported
|
||||
|
||||
|
||||
def split_words(text):
    """Return the non-empty words of `text`, using a single space as separator.

    Only the space character separates words. Empty tokens produced by
    consecutive, leading or trailing spaces are dropped, so that they are
    never recorded as words.
    """
    return [word for word in text.split(' ') if word]


if __name__ == '__main__':
    # 't:tags:b' declares the -t option, read below as `properties.tags`.
    # NOTE(review): presumably a boolean flag; confirm against the args kata.
    properties = args.parse('t:tags:b', sys.argv[1:])
    store = Store()

    # Previous word; empty for the very first word of the text.
    ante = ''

    # The text is read in one go instead of one character at a time.
    for word in split_words(sys.stdin.read()):
        store.add_word(ante, word)
        ante = word

    if properties.tags:
        exported = store.export_with_tags()
    else:
        exported = store.export()

    sys.stdout.write(json.dumps(exported))
|
17
markov/gen.sh
Executable file
17
markov/gen.sh
Executable file
|
@ -0,0 +1,17 @@
|
|||
#!/bin/sh
#
# Build the Markov statistics of a text file.
#
# Usage: gen.sh input [filter args...]
#
# The result is written next to the input, in "<input>.stats". Any extra
# argument is passed as is to filter.py.

if [ $# -lt 1 ]; then
    echo "Usage: $0 input [filter args...]" >&2
    exit 1
fi

INPUT="$1"

shift

# Locate filter.py and the args module from the script location, so that
# the script does not have to be run from its own directory.
HERE=$(dirname "$0")

# 1. Drop parentheses, carriage returns and line feeds.
# 2. Lower-case everything.
# 3. Remove runs of two or more periods (ellipses).
# 4. Insert a space before commas and periods so they become words.
# 5. Compute the statistics.
tr -d '()\r\n' < "$INPUT" \
    | tr '[:upper:]' '[:lower:]' \
    | sed 's/\.\.\.*//g' \
    | sed 's/\(.\)\([.,]\)/\1 \2/g' \
    | PYTHONPATH="$HERE/../args" "$HERE/filter.py" "$@" \
    > "$INPUT.stats"
|
48
markov/markov.py
Executable file
48
markov/markov.py
Executable file
|
@ -0,0 +1,48 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright 2023 David Soulayrol.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the “Software”), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
import args
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
|
||||
|
||||
def select_next_word(word, stats):
    """Pick the word following `word` according to `stats`.

    `stats` maps each word to a {successor: probability} dictionary, and
    the successor is drawn at random using these probabilities as weights.

    When `word` has no recorded successor (for instance the last word of
    the analysed text), the chain is restarted from a word picked uniformly
    among all the known ones instead of failing.

    Raises KeyError if `word` is not a key of `stats`.
    """
    state = stats[word]

    if not state:
        # Dead end: random.choices() cannot draw from an empty population.
        return random.choice(list(stats))

    return random.choices(list(state), weights=list(state.values()))[0]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Options: -f statistics file, -w first word, -n number of words.
    # NOTE(review): the checks below assume args.parse() yields an empty
    # string / zero for options that were not given; confirm against the
    # args kata.
    properties = args.parse('f:filename:s,w:first_word:s,n:words:d', sys.argv[1:])

    if not properties.filename:
        print('Missing statistics file.', file=sys.stderr)
        sys.exit(1)

    with open(properties.filename, 'r') as f:
        stats = json.load(f)

    if not stats:
        # Nothing to pick a first word from.
        print('Empty statistics file.', file=sys.stderr)
        sys.exit(1)

    # Any positive count is honoured; 24 words are displayed otherwise.
    n = properties.words if properties.words > 0 else 24
    word = properties.first_word or random.choice(list(stats))

    if word not in stats:
        print('Unavailable word "%s".' % word, file=sys.stderr)
        sys.exit(1)

    print(word, end=' ')

    # The first word is already displayed; the sequence is capped to 1000
    # generated words whatever was requested.
    for _ in range(min(n - 1, 1000)):
        word = select_next_word(word, stats)
        print(word, end=' ')
|
1606
markov/poe.txt
Normal file
1606
markov/poe.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue