2023-07-09 23:00:43 +00:00
#!/usr/bin/env python
# https://gist.github.com/hynekcer/fa340f3b63826168ffc0c4b33310ae9c
"""Find the longest repeated substring.

"Efficient way to find longest duplicate string for Python (From Programming Pearls)"

The algorithm is based on "Prefix doubling".
The worst-case time complexity is O(n (log n)^2). Memory requirements are linear.
"""
import time
from random import randint
import itertools
import sys
import unittest
from itertools import groupby
from operator import itemgetter
import logging
# Module-level logger; timing diagnostics are emitted at DEBUG level.
log = logging.getLogger(__name__)
try:
    # NOTE(review): the statement guarded by this handler was missing from the
    # file. Attaching a NullHandler is the conventional library-logging setup
    # that can raise AttributeError (very old Pythons lack
    # logging.NullHandler) -- confirm against the upstream source.
    log.addHandler(logging.NullHandler())
except AttributeError:
    pass
def run():
    """Command-line entry point: print the longest repeated substrings.

    The analyzed text is chosen from ``sys.argv``:

    * ``-`` as the only argument: read the text from standard input;
    * any other argument list: read the file named by the first argument;
    * no arguments: use the demo string "banana".
    """
    args = sys.argv[1:]
    if args == ["-"]:
        text = sys.stdin.read()
    elif args:
        print("Reading data...")
        # Context manager so the file handle is closed deterministically.
        with open(args[0]) as source:
            text = source.read()
    else:
        # Only fall back to the demo input when nothing was supplied;
        # otherwise the text read above would be overwritten.
        text = "banana"
    result = longest_common_substring(text)
    print('Longest common substrings in "{0}..." are:\n{1}'.format(text[:20], result))
def longest_common_substring(text):
    """Get the longest repeated substrings and their positions.

    >>> longest_common_substring('banana')
    {'ana': [1, 3]}
    >>> text = "not so Agamemnon, who spoke fiercely to "
    >>> sorted(longest_common_substring(text).items())
    [(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]

    This function can be easily modified for other criteria, e.g. for
    searching the ten longest non-overlapping repeated substrings.

    Arguments:
        text: The text to be analyzed.

    Return value: (dict)
        Maps every substring of maximal repeated length to the sorted list
        of start positions where it occurs. An empty text gives ``{}``.
        When nothing is repeated the maximal length is 0, so for texts of
        two or more characters the only key is the empty string.
    """
    if not text:
        # max() below would raise ValueError on an empty LCP array.
        return {}
    sa, _rsa, lcp = suffix_array(text)
    maxlen = max(lcp)
    result = {}
    for i in range(1, len(text)):
        if lcp[i] != maxlen:
            continue
        # Neighbours in the suffix array that share a prefix of maximal length.
        j1, j2 = sa[i - 1], sa[i]
        substring = text[j1 : j1 + maxlen]
        assert substring == text[j2 : j2 + maxlen]
        # The first pair of a run contributes both positions; every further
        # pair of the same run only adds its second position.
        result.setdefault(substring, [j1]).append(j2)
    return dict((k, sorted(v)) for k, v in result.items())
def suffix_array(text, _step=16):
    """Build the suffix array, its inverse and the LCP array of ``text``.

    Suffixes are first pre-sorted by their leading ``_step`` characters.
    The resulting groups of equal prefixes are then refined repeatedly, so
    that the guaranteed number of compared characters is doubled in every
    iteration, until all suffixes are sorted exactly.

    Arguments:
        text:  The text to be analyzed.
        _step: Is only for optimization and testing. It is the length of
               the substrings used for the initial pre-sorting; values
               below 1 are treated as 1. A bigger value is faster if there
               is enough memory. Memory requirements are approximately
               (estimate for 32 bit Python 3.3):
                   len(text) * (29 + (_step + 20 if _step > 2 else 0)) + 1MB

    Return value:      (tuple)
      (sa, rsa, lcp)
        sa:  Suffix array                  for i in range(1, size):
               assert text[sa[i-1]:] < text[sa[i]:]
        rsa: Reverse suffix array          for i in range(size):
               assert rsa[sa[i]] == i
        lcp: Longest common prefix         for i in range(1, size):
               assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
               if sa[i-1] + lcp[i] < len(text):
                   assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]
             lcp[0] is always 0.

    >>> suffix_array(text='banana')
    ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])

    Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
    The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
    It is between  tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:]
    """
    tx = text
    t0 = time.time()
    size = len(tx)
    step = min(max(_step, 1), size)
    sa = list(range(size))
    log.debug("%6.3f pre sort", time.time() - t0)
    sa.sort(key=lambda i: tx[i : i + step])
    log.debug("%6.3f after sort", time.time() - t0)

    # grpstart[k] is True when an unresolved group (two or more suffixes whose
    # compared prefixes are still equal) starts at sa[k]. It lets the loops
    # below skip positions that are already resolved. The last value True is
    # a sentinel, so grpstart.index(True) always finds something.
    grpstart = size * [False] + [True]
    rsa = size * [None]
    stgrp, igrp = "", 0
    for i, pos in enumerate(sa):
        st = tx[pos : pos + step]
        if st != stgrp:
            # Close the previous group; it stays unresolved only if it holds
            # more than one suffix.
            grpstart[igrp] = igrp < i - 1
            stgrp = st
            igrp = i
        rsa[pos] = igrp  # provisional rank: index of the group's first slot
    grpstart[igrp] = igrp < size - 1 or size == 0
    log.debug("%6.3f after group", time.time() - t0)

    while grpstart.index(True) < size:
        # assert step <= size
        nmerge = 0
        nextgr = grpstart.index(True)
        while nextgr < size:
            igrp = nextgr
            nextgr = grpstart.index(True, igrp + 1)
            glist = []
            for ig in range(igrp, nextgr):
                pos = sa[ig]
                if rsa[pos] != igrp:
                    # Left the current group; what follows up to nextgr are
                    # already resolved single suffixes.
                    break
                # Rank of the suffix that starts `step` characters later;
                # running off the end of the text sorts before everything.
                newgr = rsa[pos + step] if pos + step < size else -1
                glist.append((newgr, pos))
            # groupby() only merges adjacent equal keys, and the order of
            # glist becomes the new order in sa, so it must be sorted first.
            glist.sort()
            for _key, members in groupby(glist, key=itemgetter(0)):
                g = [x[1] for x in members]
                sa[igrp : igrp + len(g)] = g
                grpstart[igrp] = len(g) > 1
                for pos in g:
                    rsa[pos] = igrp
                igrp += len(g)
            nmerge += len(glist)
        log.debug("%6.3f for step=%d nmerge=%d", time.time() - t0, step, nmerge)
        step *= 2
    del grpstart

    # Create the LCP array. Text positions are visited in order so that the
    # match length h can be reused: it shrinks by at most one from one
    # position to the next.
    lcp = size * [None]
    h = 0
    for i in range(size):
        if rsa[i] > 0:
            j = sa[rsa[i] - 1]  # start of the preceding suffix in sorted order
            while i != size - h and j != size - h and tx[i + h] == tx[j + h]:
                h += 1
            lcp[rsa[i]] = h
            if h > 0:
                h -= 1
    if size > 0:
        lcp[0] = 0  # the first suffix has no predecessor
    log.debug("%6.3f end", time.time() - t0)
    return sa, rsa, lcp
# ---
class TestMixin(object):
    """Verification helper shared by the suffix array test cases.

    It relies on the assert* methods of unittest.TestCase, so it must be
    mixed into a TestCase subclass.
    """

    def suffix_verify(self, text, step=16):
        """Check the invariants of ``suffix_array(text, _step=step)``.

        Verified properties:
          * ``sa`` contains every position of the text;
          * ``rsa`` is the inverse of ``sa``;
          * neighbouring suffixes share exactly ``lcp`` characters, and the
            character after them is greater for the later suffix (an
            exhausted suffix compares as the empty string, i.e. smallest);
          * the common prefix does not run past the end of the text.
        """
        tx = text
        sa, rsa, lcp = suffix_array(text=tx, _step=step)
        self.assertEqual(set(sa), set(range(len(tx))))
        self.assertEqual(
            [rsa[pos] for pos in sa],
            list(range(len(tx))),
            "Verify rsa is the inverse of sa on text '%s...'" % text[:20],
        )
        for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]):
            self.assertEqual(
                tx[i1 : i1 + h],
                tx[i0 : i0 + h],
                "Verify LCP characters equal on text '%s...'" % text[:20],
            )
            # One-character slices, so that the end of the text gives ''.
            self.assertGreater(
                tx[i1 + h : i1 + h + 1],
                tx[i0 + h : i0 + h + 1],
                "Verify LCP+1 char is different '%s...'" % text[:20],
            )
            self.assertLessEqual(
                max(i0, i1),
                len(tx) - h,
                "Verify LCP is not more than length of string '%s...'" % text[:20],
            )
class SuffixArrayTest(unittest.TestCase, TestMixin):
    """Fast unit tests for suffix_array() and longest_common_substring()."""

    def test_16(self):
        """'banana' with the default pre-sort length."""
        # 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
        expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
        self.assertEqual(suffix_array(text="banana", _step=16), expect)

    def test_1(self):
        """'banana' with the shortest pre-sort length (most refinement passes)."""
        expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
        self.assertEqual(suffix_array(text="banana", _step=1), expect)

    def test_mini(self):
        """Empty and very short texts."""
        self.assertEqual(suffix_array(text="", _step=1), ([], [], []))
        self.assertEqual(suffix_array(text="a", _step=1), ([0], [0], [0]))
        self.assertEqual(suffix_array(text="aa", _step=1), ([1, 0], [1, 0], [0, 1]))
        self.assertEqual(
            suffix_array(text="aaa", _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2])
        )

    def test_example(self):
        """Verify the invariants on the sample sentence also used by test_lcp."""
        # NOTE(review): the original body of this test was missing; confirm
        # the intended example text.
        self.suffix_verify("not so Agamemnon, who spoke fiercely to ")

    def test_cartesian(self):
        """Test all combinations of alphabet "ABC" up to length 6 characters"""
        for size in range(7):
            for cartesian in itertools.product(*(size * ["ABC"])):
                text = "".join(cartesian)
                log.debug('Testing "%s"', text)
                self.suffix_verify(text, 1)

    def test_lcp(self):
        """longest_common_substring() on the documented examples."""
        expect = {"ana": [1, 3]}
        self.assertDictEqual(longest_common_substring("banana"), expect)
        expect = {" s": [3, 21], "no": [0, 13], "o ": [5, 20, 38]}
        self.assertDictEqual(
            longest_common_substring("not so Agamemnon, who spoke fiercely to "), expect
        )
class SlowTests(unittest.TestCase, TestMixin):
    """Slow development tests running many minutes.

    It can be run only by an EXPLICIT command!
    e.g.: python -m unittest maxsubstring.SlowTests._test_random
    """

    def _test_random(self):
        """Verify random texts of growing size over several alphabet sizes.

        The leading underscore keeps the method out of unittest's automatic
        test discovery.
        """
        for power in range(2, 21, 2):
            size = randint(2 ** (power - 1), 2**power)
            for alphabet in (2, 4, 16, 256):
                text = "".join(chr(65 + randint(0, alphabet - 1)) for _ in range(size))
                # Each text is checked with both pre-sort lengths.
                log.debug("%s %s %s", size, alphabet, 1)
                self.suffix_verify(text, 1)
                log.debug("%s %s %s", size, alphabet, 16)
                self.suffix_verify(text, 16)
if __name__ == "__main__":