250 lines
8.7 KiB
Python
250 lines
8.7 KiB
Python
|
#!/usr/bin/env python
|
||
|
# https://gist.github.com/hynekcer/fa340f3b63826168ffc0c4b33310ae9c
|
||
|
"""Find the longest repeated substring.
|
||
|
"Efficient way to find longest duplicate string for Python (From Programming Pearls)"
|
||
|
http://stackoverflow.com/questions/13560037/
|
||
|
|
||
|
The algorithm is based on "Prefix doubling".
|
||
|
The worst time complexity is O(n (log n)^2). Memory requirements are linear.
|
||
|
"""
|
||
|
|
||
|
import time
|
||
|
|
||
|
from random import randint
|
||
|
import itertools
|
||
|
import sys
|
||
|
import unittest
|
||
|
from itertools import groupby
|
||
|
from operator import itemgetter
|
||
|
import logging
|
||
|
|
||
|
log = logging.getLogger(__name__)
|
||
|
log.setLevel(logging.INFO)
|
||
|
try:
|
||
|
log.addHandler(logging.NullHandler())
|
||
|
except AttributeError:
|
||
|
pass
|
||
|
|
||
|
|
||
|
def run():
|
||
|
if sys.argv[1:] == ["-"]:
|
||
|
text = sys.stdin.read()
|
||
|
elif sys.argv[1:]:
|
||
|
print("Reading data...")
|
||
|
text = open(sys.argv[1]).read()
|
||
|
else:
|
||
|
text = "banana"
|
||
|
print("Sorting...")
|
||
|
result = longest_common_substring(text)
|
||
|
print('Longest common substrings in "{0}..." are:\n{1}'.format(text[:20], result))
|
||
|
|
||
|
|
||
|
def longest_common_substring(text):
|
||
|
"""Get the longest common substrings and their positions.
|
||
|
>>> longest_common_substring('banana')
|
||
|
{'ana': [1, 3]}
|
||
|
>>> text = "not so Agamemnon, who spoke fiercely to "
|
||
|
>>> sorted(longest_common_substring(text).items())
|
||
|
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]
|
||
|
|
||
|
This function can be easy modified for any criteria, e.g. for searching ten
|
||
|
longest non overlapping repeated substrings.
|
||
|
"""
|
||
|
sa, rsa, lcp = suffix_array(text)
|
||
|
maxlen = max(lcp)
|
||
|
result = {}
|
||
|
for i in range(1, len(text)):
|
||
|
if lcp[i] == maxlen:
|
||
|
j1, j2, h = sa[i - 1], sa[i], lcp[i]
|
||
|
assert text[j1 : j1 + h] == text[j2 : j2 + h]
|
||
|
substring = text[j1 : j1 + h]
|
||
|
if substring not in result:
|
||
|
result[substring] = [j1]
|
||
|
result[substring].append(j2)
|
||
|
return dict((k, sorted(v)) for k, v in result.items())
|
||
|
|
||
|
|
||
|
def suffix_array(text, _step=16):
|
||
|
"""Analyze all common strings in the text.
|
||
|
|
||
|
Short substrings of the length _step a are first pre-sorted. The are the
|
||
|
results repeatedly merged so that the garanteed number of compared
|
||
|
characters bytes is doubled in every iteration until all substrings are
|
||
|
sorted exactly.
|
||
|
Arguments:
|
||
|
text: The text to be analyzed.
|
||
|
_step: Is only for optimization and testing. It is the optimal length
|
||
|
of substrings used for initial pre-sorting. The bigger value is
|
||
|
faster if there is enough memory. Memory requirements are
|
||
|
approximately (estimate for 32 bit Python 3.3):
|
||
|
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB
|
||
|
|
||
|
Return value: (tuple)
|
||
|
(sa, rsa, lcp)
|
||
|
sa: Suffix array for i in range(1, size):
|
||
|
assert text[sa[i-1]:] < text[sa[i]:]
|
||
|
rsa: Reverse suffix array for i in range(size):
|
||
|
assert rsa[sa[i]] == i
|
||
|
lcp: Longest common prefix for i in range(1, size):
|
||
|
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
|
||
|
if sa[i-1] + lcp[i] < len(text):
|
||
|
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]
|
||
|
>>> suffix_array(text='banana')
|
||
|
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
|
||
|
|
||
|
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
|
||
|
The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
|
||
|
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:]
|
||
|
"""
|
||
|
tx = text
|
||
|
t0 = time.time()
|
||
|
size = len(tx)
|
||
|
step = min(max(_step, 1), len(tx))
|
||
|
sa = list(range(len(tx)))
|
||
|
log.debug("%6.3f pre sort", time.time() - t0)
|
||
|
sa.sort(key=lambda i: tx[i : i + step])
|
||
|
log.debug("%6.3f after sort", time.time() - t0)
|
||
|
grpstart = size * [False] + [True] # a boolean map for iteration speedup.
|
||
|
# It helps to skip yet resolved values. The last value True is a sentinel.
|
||
|
rsa = size * [None]
|
||
|
stgrp, igrp = "", 0
|
||
|
for i, pos in enumerate(sa):
|
||
|
st = tx[pos : pos + step]
|
||
|
if st != stgrp:
|
||
|
grpstart[igrp] = igrp < i - 1
|
||
|
stgrp = st
|
||
|
igrp = i
|
||
|
rsa[pos] = igrp
|
||
|
sa[i] = pos
|
||
|
grpstart[igrp] = igrp < size - 1 or size == 0
|
||
|
log.debug("%6.3f after group", time.time() - t0)
|
||
|
while grpstart.index(True) < size:
|
||
|
# assert step <= size
|
||
|
nmerge = 0
|
||
|
nextgr = grpstart.index(True)
|
||
|
while nextgr < size:
|
||
|
igrp = nextgr
|
||
|
nextgr = grpstart.index(True, igrp + 1)
|
||
|
glist = []
|
||
|
for ig in range(igrp, nextgr):
|
||
|
pos = sa[ig]
|
||
|
if rsa[pos] != igrp:
|
||
|
break
|
||
|
newgr = rsa[pos + step] if pos + step < size else -1
|
||
|
glist.append((newgr, pos))
|
||
|
glist.sort()
|
||
|
for ig, g in groupby(glist, key=itemgetter(0)):
|
||
|
g = [x[1] for x in g]
|
||
|
sa[igrp : igrp + len(g)] = g
|
||
|
grpstart[igrp] = len(g) > 1
|
||
|
for pos in g:
|
||
|
rsa[pos] = igrp
|
||
|
igrp += len(g)
|
||
|
nmerge += len(glist)
|
||
|
log.debug("%6.3f for step=%d nmerge=%d", time.time() - t0, step, nmerge)
|
||
|
step *= 2
|
||
|
del grpstart
|
||
|
# create LCP array
|
||
|
lcp = size * [None]
|
||
|
h = 0
|
||
|
for i in range(size):
|
||
|
if rsa[i] > 0:
|
||
|
j = sa[rsa[i] - 1]
|
||
|
while i != size - h and j != size - h and tx[i + h] == tx[j + h]:
|
||
|
h += 1
|
||
|
lcp[rsa[i]] = h
|
||
|
if h > 0:
|
||
|
h -= 1
|
||
|
if size > 0:
|
||
|
lcp[0] = 0
|
||
|
log.debug("%6.3f end", time.time() - t0)
|
||
|
return sa, rsa, lcp
|
||
|
|
||
|
|
||
|
# ---
|
||
|
|
||
|
|
||
|
class TestMixin(object):
|
||
|
def suffix_verify(self, text, step=16):
|
||
|
tx = text
|
||
|
sa, rsa, lcp = suffix_array(text=tx, _step=step)
|
||
|
self.assertEqual(set(sa), set(range(len(tx))))
|
||
|
ok = True
|
||
|
for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]):
|
||
|
self.assertEqual(
|
||
|
tx[i1 : i1 + h],
|
||
|
tx[i0 : i0 + h],
|
||
|
"Verify LCP characters equal on text '%s...'" % text[:20],
|
||
|
)
|
||
|
self.assertGreater(
|
||
|
tx[i1 + h : i1 + h + 1],
|
||
|
tx[i0 + h : i0 + h + 1],
|
||
|
"Verify LCP+1 char is different '%s...'" % text[:20],
|
||
|
)
|
||
|
self.assertLessEqual(
|
||
|
max(i0, i1),
|
||
|
len(tx) - h,
|
||
|
"Verify LCP is not more than length of string '%s...'" % text[:20],
|
||
|
)
|
||
|
self.assertTrue(ok)
|
||
|
|
||
|
|
||
|
class SuffixArrayTest(unittest.TestCase, TestMixin):
|
||
|
def test_16(self):
|
||
|
# 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
|
||
|
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
|
||
|
self.assertEqual(suffix_array(text="banana", _step=16), expect)
|
||
|
|
||
|
def test_1(self):
|
||
|
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
|
||
|
self.assertEqual(suffix_array(text="banana", _step=1), expect)
|
||
|
|
||
|
def test_mini(self):
|
||
|
self.assertEqual(suffix_array(text="", _step=1), ([], [], []))
|
||
|
self.assertEqual(suffix_array(text="a", _step=1), ([0], [0], [0]))
|
||
|
self.assertEqual(suffix_array(text="aa", _step=1), ([1, 0], [1, 0], [0, 1]))
|
||
|
self.assertEqual(
|
||
|
suffix_array(text="aaa", _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2])
|
||
|
)
|
||
|
|
||
|
def test_example(self):
|
||
|
self.suffix_verify("abracadabra")
|
||
|
|
||
|
def test_cartesian(self):
|
||
|
"""Test all combinations of alphabet "ABC" up to length 4 characters"""
|
||
|
for size in range(7):
|
||
|
for cartesian in itertools.product(*(size * ["ABC"])):
|
||
|
text = "".join(cartesian)
|
||
|
log.debug('Testing "%s"', text)
|
||
|
self.suffix_verify(text, 1)
|
||
|
|
||
|
def test_lcp(self):
|
||
|
expect = {"ana": [1, 3]}
|
||
|
self.assertDictEqual(longest_common_substring("banana"), expect)
|
||
|
expect = {" s": [3, 21], "no": [0, 13], "o ": [5, 20, 38]}
|
||
|
self.assertDictEqual(
|
||
|
longest_common_substring("not so Agamemnon, who spoke fiercely to "), expect
|
||
|
)
|
||
|
|
||
|
|
||
|
class SlowTests(unittest.TestCase, TestMixin):
|
||
|
"""Slow development tests running many minutes.
|
||
|
|
||
|
It can be run only by an EXPLICIT command!
|
||
|
e.g.: python -m unittest maxsubstring.SlowTests._test_random
|
||
|
"""
|
||
|
|
||
|
def _test_random(self):
|
||
|
for power in range(2, 21, 2):
|
||
|
size = randint(2 ** (power - 1), 2**power)
|
||
|
for alphabet in (2, 4, 16, 256):
|
||
|
text = "".join(chr(65 + randint(0, alphabet - 1)) for _ in range(size))
|
||
|
log.debug("%s %s %s", size, alphabet, 1)
|
||
|
self.suffix_verify(text, 1)
|
||
|
log.debug("%s %s %s", size, alphabet, 16)
|
||
|
self.suffix_verify(text, 16)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
run()
|