#!/usr/bin/env python
# https://gist.github.com/hynekcer/fa340f3b63826168ffc0c4b33310ae9c
"""Find the longest repeated substring.

"Efficient way to find longest duplicate string for Python
(From Programming Pearls)"
http://stackoverflow.com/questions/13560037/

The algorithm is based on "Prefix doubling".
The worst time complexity is O(n (log n)^2). Memory requirements are linear.
"""
import itertools
import logging
import sys
import time
import unittest
from itertools import groupby
from operator import itemgetter
from random import randint

log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
try:
    log.addHandler(logging.NullHandler())
except AttributeError:
    # logging.NullHandler is missing on very old Python versions.
    pass


def run():
    """Command line entry point.

    The text is read from standard input when the only argument is "-",
    from the file named by the first argument when one is given, and
    defaults to "banana" otherwise. The longest repeated substrings and
    their positions are printed.
    """
    if sys.argv[1:] == ["-"]:
        text = sys.stdin.read()
    elif sys.argv[1:]:
        print("Reading data...")
        # The context manager closes the file even if read() fails.
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = "banana"
    print("Sorting...")
    result = longest_common_substring(text)
    print('Longest common substrings in "{0}..." are:\n{1}'.format(text[:20], result))


def longest_common_substring(text):
    """Get the longest common substrings and their positions.

    Returns a dict that maps every longest repeated substring to the sorted
    list of positions where it starts. The dict is empty if the text is
    empty or if no non-empty substring occurs more than once.

    >>> longest_common_substring('banana')
    {'ana': [1, 3]}
    >>> text = "not so Agamemnon, who spoke fiercely to "
    >>> sorted(longest_common_substring(text).items())
    [(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]
    >>> longest_common_substring('abc')
    {}

    This function can be easily modified for any criteria, e.g. for
    searching ten longest non overlapping repeated substrings.
    """
    sa, _rsa, lcp = suffix_array(text)
    # max() of an empty sequence raises ValueError, so guard the empty text.
    maxlen = max(lcp) if lcp else 0
    if maxlen == 0:
        # Nothing is repeated; do not report the empty string as a match.
        return {}
    result = {}
    for i in range(1, len(text)):
        if lcp[i] == maxlen:
            j1, j2 = sa[i - 1], sa[i]
            substring = text[j1 : j1 + maxlen]
            # Internal sanity check of the LCP array.
            assert substring == text[j2 : j2 + maxlen]
            # Neighbouring suffixes chain: (a, b), (b, c) -> [a, b, c].
            result.setdefault(substring, [j1]).append(j2)
    return {key: sorted(positions) for key, positions in result.items()}


def _lcp_array(text, sa, rsa):
    """Build the LCP array from a suffix array and its inverse.

    The suffixes are visited in text order and the match length ``h`` found
    for one suffix, minus one, is reused as the starting point for the next
    suffix (the same scheme as Kasai's algorithm), so characters already
    known to be equal are not compared again.
    """
    size = len(text)
    lcp = size * [None]
    h = 0
    for i in range(size):
        rank = rsa[i]
        if rank > 0:
            j = sa[rank - 1]  # the suffix that precedes suffix i in sorted order
            while i + h < size and j + h < size and text[i + h] == text[j + h]:
                h += 1
            lcp[rank] = h
            if h > 0:
                h -= 1
    if size > 0:
        # The first sorted suffix has no predecessor.
        lcp[0] = 0
    return lcp


def suffix_array(text, _step=16):
    """Analyze all common strings in the text.

    Short substrings of the length _step are first pre-sorted. Then the
    results are repeatedly merged so that the guaranteed number of compared
    characters is doubled in every iteration until all substrings are
    sorted exactly.

    Arguments:
        text:  The text to be analyzed.
        _step: Is only for optimization and testing. It is the optimal length
               of substrings used for initial pre-sorting. The bigger value
               is faster if there is enough memory. Memory requirements are
               approximately (author's estimate for 32 bit Python 3.3):
                   len(text) * (29 + (_step + 20 if _step > 2 else 0)) + 1MB

    Return value:      (tuple)
      (sa, rsa, lcp)
        sa:  Suffix array                  for i in range(1, size):
               assert text[sa[i-1]:] < text[sa[i]:]
        rsa: Reverse suffix array          for i in range(size):
               assert rsa[sa[i]] == i
        lcp: Longest common prefix         for i in range(1, size):
               assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
               if sa[i-1] + lcp[i] < len(text):
                   assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]

    >>> suffix_array(text='banana')
    ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])

    Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
    The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
    It is between  text[sa[1]:] == 'ana' < 'anana' == text[sa[2]:]
    """
    t0 = time.time()
    size = len(text)
    step = min(max(_step, 1), size)
    sa = list(range(size))
    log.debug("%6.3f pre sort", time.time() - t0)
    sa.sort(key=lambda i: text[i : i + step])
    log.debug("%6.3f after sort", time.time() - t0)

    # grpstart[i] is True if an unresolved group (more than one suffix with
    # the same rank) starts at sa[i]. It is a boolean map for iteration
    # speedup: it helps to skip already resolved values.
    # The last value True is a sentinel.
    grpstart = size * [False] + [True]
    rsa = size * [None]
    stgrp, igrp = "", 0
    for i, pos in enumerate(sa):
        st = text[pos : pos + step]
        if st != stgrp:
            # Close the previous group; it is unresolved if it has > 1 item.
            grpstart[igrp] = igrp < i - 1
            stgrp = st
            igrp = i
        rsa[pos] = igrp
    # Close the last group. For the empty text index 0 is the sentinel itself
    # and must stay True.
    grpstart[igrp] = igrp < size - 1 or size == 0
    log.debug("%6.3f after group", time.time() - t0)

    while grpstart.index(True) < size:
        # assert step <= size
        nmerge = 0
        nextgr = grpstart.index(True)
        while nextgr < size:
            igrp = nextgr
            nextgr = grpstart.index(True, igrp + 1)
            # Collect the members of the group starting at igrp, keyed by the
            # rank of the suffix that begins `step` characters later
            # (-1 if that would be past the end of the text).
            glist = []
            for ig in range(igrp, nextgr):
                pos = sa[ig]
                if rsa[pos] != igrp:
                    # The group ended; resolved singletons follow.
                    break
                newgr = rsa[pos + step] if pos + step < size else -1
                glist.append((newgr, pos))
            glist.sort()
            # Split the group into subgroups with the same secondary rank.
            for _, members in groupby(glist, key=itemgetter(0)):
                g = [item[1] for item in members]
                sa[igrp : igrp + len(g)] = g
                grpstart[igrp] = len(g) > 1
                for pos in g:
                    rsa[pos] = igrp
                igrp += len(g)
            nmerge += len(glist)
        log.debug("%6.3f for step=%d nmerge=%d", time.time() - t0, step, nmerge)
        step *= 2
    del grpstart

    # create LCP array
    lcp = _lcp_array(text, sa, rsa)
    log.debug("%6.3f end", time.time() - t0)
    return sa, rsa, lcp


# ---


class TestMixin(object):
    """Shared verification helper for the test cases below."""

    def suffix_verify(self, text, step=16):
        """Check the invariants of suffix_array() for the given text."""
        tx = text
        sa, rsa, lcp = suffix_array(text=tx, _step=step)
        self.assertEqual(set(sa), set(range(len(tx))))
        for rank, pos in enumerate(sa):
            self.assertEqual(
                rsa[pos],
                rank,
                "Verify rsa is the inverse of sa on text '%s...'" % text[:20],
            )
        for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]):
            self.assertEqual(
                tx[i1 : i1 + h],
                tx[i0 : i0 + h],
                "Verify LCP characters equal on text '%s...'" % text[:20],
            )
            self.assertGreater(
                tx[i1 + h : i1 + h + 1],
                tx[i0 + h : i0 + h + 1],
                "Verify LCP+1 char is different '%s...'" % text[:20],
            )
            self.assertLessEqual(
                max(i0, i1),
                len(tx) - h,
                "Verify LCP is not more than length of string '%s...'" % text[:20],
            )


class SuffixArrayTest(unittest.TestCase, TestMixin):
    def test_16(self):
        # 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
        expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
        self.assertEqual(suffix_array(text="banana", _step=16), expect)

    def test_1(self):
        expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
        self.assertEqual(suffix_array(text="banana", _step=1), expect)

    def test_mini(self):
        self.assertEqual(suffix_array(text="", _step=1), ([], [], []))
        self.assertEqual(suffix_array(text="a", _step=1), ([0], [0], [0]))
        self.assertEqual(suffix_array(text="aa", _step=1), ([1, 0], [1, 0], [0, 1]))
        self.assertEqual(
            suffix_array(text="aaa", _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2])
        )

    def test_example(self):
        self.suffix_verify("abracadabra")

    def test_cartesian(self):
        """Test all combinations of alphabet "ABC" up to length 6 characters"""
        for size in range(7):
            for cartesian in itertools.product(*(size * ["ABC"])):
                text = "".join(cartesian)
                log.debug('Testing "%s"', text)
                self.suffix_verify(text, 1)

    def test_lcp(self):
        expect = {"ana": [1, 3]}
        self.assertDictEqual(longest_common_substring("banana"), expect)
        expect = {" s": [3, 21], "no": [0, 13], "o ": [5, 20, 38]}
        self.assertDictEqual(
            longest_common_substring("not so Agamemnon, who spoke fiercely to "), expect
        )


class SlowTests(unittest.TestCase, TestMixin):
    """Slow development tests running many minutes.

    It can be run only by an EXPLICIT command!
    e.g.: python -m unittest maxsubstring.SlowTests._test_random
    """

    def _test_random(self):
        for power in range(2, 21, 2):
            size = randint(2 ** (power - 1), 2**power)
            for alphabet in (2, 4, 16, 256):
                text = "".join(chr(65 + randint(0, alphabet - 1)) for _ in range(size))
                log.debug("%s %s %s", size, alphabet, 1)
                self.suffix_verify(text, 1)
                log.debug("%s %s %s", size, alphabet, 16)
                self.suffix_verify(text, 16)


if __name__ == "__main__":
    run()