diff --git a/notes-en/assets/licenses-heatmap.png b/notes-en/assets/licenses-heatmap.png
new file mode 100644
index 0000000..832348c
Binary files /dev/null and b/notes-en/assets/licenses-heatmap.png differ
diff --git a/notes-en/ncd.md b/notes-en/ncd.md
index 274c844..687e121 100644
--- a/notes-en/ncd.md
+++ b/notes-en/ncd.md
@@ -41,12 +41,156 @@ So, what can we use? In the original paper authors used real compressors like `Z
## Entropy
+$$S=-\sum _{i}P_{i}\log {P_{i}}$$
## Use entropy in NCD
## Let's practice!
+>>> from textdistance import entropy_ncd
+Identical sequences have a distance of 0, totally different ones -- 1:
+>>> entropy_ncd('a', 'a')
+>>> entropy_ncd('a', 'b')
+>>> entropy_ncd('a', 'a' * 40)
+More differences -- higher distance:
+>>> entropy_ncd('text', 'text')
+>>> entropy_ncd('text', 'test')
+>>> entropy_ncd('text', 'nani')
+Distance depends on the size difference between strings:
+>>> entropy_ncd('a', 'bc')
+>>> entropy_ncd('a', 'bcd')
+>>> entropy_ncd('a', 'bbb')
+>>> entropy_ncd('a', 'bbbbbb')
+>>> entropy_ncd('aaaa', 'bbbb')
+Sometimes Entropy-based NCD gives non-intuitive results:
+>>> entropy_ncd('a', 'abbbbbb')
+>>> entropy_ncd('a', 'aaaaaab')
+>>> entropy_ncd('aaaaaaa', 'abbbbbb')
+## Most similar licenses
+Let's compare the texts of the licenses from [choosealicense.com](https://choosealicense.com/):
+git clone https://github.com/github/choosealicense.com.git
+We will take the name of a license as a command-line argument, compare its text with the text of every license (including itself), and sort the results by distance:
+from itertools import islice
+from pathlib import Path
+from sys import argv
+from textdistance import entropy_ncd
+# read every license file into a dict keyed by the file name without its extension
+licenses = dict()
+for path in Path('choosealicense.com', '_licenses').iterdir():
+ licenses[path.stem] = path.read_text()
+# compute the distance from the license named by the first CLI argument to every license (itself included)
+compare_with = argv[1]
+distances = dict()
+for name, content in licenses.items():
+ distances[name] = entropy_ncd(
+ licenses[compare_with],
+ content,
+ )
+# print the 5 entries with the smallest distance, closest first
+sorted_distances = sorted(distances.items(), key=lambda d: d[1])
+for name, distance in islice(sorted_distances, 5):
+ print('{:20} {:.4f}'.format(name, distance))
+OK, let's have a look at some licenses:
+$ python3 compare.py gpl-3.0
+gpl-3.0 0.0000
+agpl-3.0 0.0013
+osl-3.0 0.0016
+cc0-1.0 0.0020
+lgpl-2.1 0.0022
+$ python3 compare.py mit
+mit 0.0000
+unlicense 0.0031
+bsl-1.0 0.0061
+bsd-3-clause-clear 0.0066
+ncsa 0.0070
+$ python3 compare.py bsd-2-clause
+bsd-2-clause 0.0000
+postgresql 0.0041
+bsd-3-clause 0.0054
+isc 0.0070
+0bsd 0.0073
+Now, let's make a heatmap!
+distances = []
+for name1, content1 in licenses.items():
+ for name2, content2 in licenses.items():
+ distances.append((name1, name2, entropy_ncd(content1, content2)))
+import plotnine as gg
+import pandas as pd
+df = pd.DataFrame(distances, columns=['name1', 'name2', 'distance'])
+ gg.ggplot(df)
+ + gg.geom_tile(gg.aes(x='name1', y='name2', fill='distance'))
+ # reverse the order of the default continuous palette's colors
+ + gg.scale_fill_continuous(
+ palette=lambda *args: gg.scale_fill_continuous().palette(*args)[::-1],
+ )
+ + gg.theme(
+ figure_size=(12, 8), # make the chart bigger
+ axis_text_x=gg.element_text(angle=30), # rotate the x-axis labels by 30 degrees
+ )
## Further reading