An attempt of my own path for a while, my devotion to jbaylina for its beautiful algorithm and implementation of C. Here is my attempt at the jbaylina algorithm in Haskell, and below is the further development of my attempt at a linear time algorithm that tries to compress segments that include repeating patterns in sequence:
import Data.Map (fromList, insert, size, (!)) compress s = (foldl f (fromList [(0,([],0)),(1,([s!!0],1))]) [1..n - 1]) ! n where n = length s fbi = insert (size b) bestCandidate b where add (sequence, sLength) (sequence', sLength') = (sequence ++ sequence', sLength + sLength') j' = [1..min 100 i] bestCandidate = foldr combCandidates (b!i `add` ([s!!i,'1'],2)) j' combCandidates j candidate' = let nextCandidate' = comb 2 (b!(i - j + 1) `add` ((take j . drop (i - j + 1) $ s) ++ "1", j + 1)) in if snd nextCandidate' <= snd candidate' then nextCandidate' else candidate' where comb r candidate | r > uBound = candidate | not (strcmp r True) = candidate | snd nextCandidate <= snd candidate = comb (r + 1) nextCandidate | otherwise = comb (r + 1) candidate where uBound = div (i + 1) j prev = b!(i - r * j + 1) nextCandidate = prev `add` ((take j . drop (i - j + 1) $ s) ++ show r, j + length (show r)) strcmp 1 _ = True strcmp num bool | (take j . drop (i - num * j + 1) $ s) == (take j . drop (i - (num - 1) * j + 1) $ s) = strcmp (num - 1) True | otherwise = False
Output:
*Main> compress "aaagctgctagag" ("a3gct2ag2",9) *Main> compress "aaabbbaaabbbaaabbbaaabbb" ("aaabbb4",7)
Linear Attempt:
import Data.List (sortBy) group' xxs sAccum (chr, count) | null xxs = if null chr then singles else if count <= 2 then reverse sAccum ++ multiples ++ "1" else singles ++ if null chr then [] else chr ++ show count | [x] == chr = group' xs sAccum (chr,count + 1) | otherwise = if null chr then group' xs (sAccum) ([x],1) else if count <= 2 then group' xs (multiples ++ sAccum) ([x],1) else singles ++ chr ++ show count ++ group' xs [] ([x],1) where x:xs = xxs singles = reverse sAccum ++ (if null sAccum then [] else "1") multiples = concat (replicate count chr) sequences ws strIndex maxSeqLen = repeated' where half = if null . drop (2 * maxSeqLen - 1) $ ws then div (length ws) 2 else maxSeqLen repeated' = let (sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) = repeated in (sequence,(sequenceStart, sequenceEnd')) repeated = foldr divide ([],(strIndex,strIndex),False) [1..half] equalChunksOf ta = takeWhile(==t) . map (take a) . iterate (drop a) divide chunkSize b@(sequence,(sequenceStart, sequenceEnd'),notSinglesFlag) = let t = take (2*chunkSize) ws t' = take chunkSize t in if t' == drop chunkSize t then let ts = equalChunksOf t' chunkSize ws lenTs = length ts sequenceEnd = strIndex + lenTs * chunkSize newEnd = if sequenceEnd > sequenceEnd' then sequenceEnd else sequenceEnd' in if chunkSize > 1 then if length (group' (concat (replicate lenTs t')) [] ([],0)) > length (t' ++ show lenTs) then (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),True) else b else if notSinglesFlag then b else (((strIndex,sequenceEnd,chunkSize,lenTs),t'):sequence, (sequenceStart,newEnd),False) else b addOne ab | null (fst b) = a | null (fst a) = b | otherwise = let (((start,end,patLen,lenS),sequence):rest,(sStart,sEnd)) = a (((start',end',patLen',lenS'),sequence'):rest',(sStart',sEnd')) = b in if sStart' < sEnd && sEnd < sEnd' then let c = ((start,end,patLen,lenS),sequence):rest d = ((start',end',patLen',lenS'),sequence'):rest' in (c ++ d, (sStart, sEnd')) else a segment xs baseIndex maxSeqLen = segment' xs baseIndex baseIndex where segment' zzs@(z:zs) strIndex farthest | null zs = initial | strIndex >= farthest && strIndex > 0 = ([],(0,0)) | otherwise = addOne initial next where next@(s',(start',end')) = segment' zs (strIndex + 1) farthest' farthest' | null s = farthest | otherwise = if start /= end && end > farthest then end else farthest initial@(s,(start,end)) = sequences zzs strIndex maxSeqLen areExclusive ((a,b,_,_),_) ((a',b',_,_),_) = (a' >= b) || (b' <= a) combs [] r = [r] combs (x:xs) r | null r = combs xs (x:r) ++ if null xs then [] else combs xs r | otherwise = if areExclusive (head r) x then combs xs (x:r) ++ combs xs r else if l' > lowerBound then combs xs (x: reduced : drop 1 r) ++ combs xs r else combs xs r where lowerBound = l + 2 * patLen ((l,u,patLen,lenS),s) = head r ((l',u',patLen',lenS'),s') = x reduce = takeWhile (>=l') . iterate (\x -> x - patLen) $ u lenReduced = length reduce reduced = ((l,u - lenReduced * patLen,patLen,lenS - lenReduced),s) buildString origStr sequences = buildString' origStr sequences 0 (0,"",0) where buildString' origStr sequences index accum@(lenC,cStr,lenOrig) | null sequences = accum | l /= index = buildString' (drop l' origStr) sequences l (lenC + l' + 1, cStr ++ take l' origStr ++ "1", lenOrig + l') | otherwise = buildString' (drop u' origStr) rest u (lenC + length s', cStr ++ s', lenOrig + u') where l' = l - index u' = u - ls' = s ++ show lenS (((l,u,patLen,lenS),s):rest) = sequences compress [] _ accum = reverse accum ++ (if null accum then [] else "1") compress zzs@(z:zs) maxSeqLen accum | null (fst segment') = compress zs maxSeqLen (z:accum) | (start,end) == (0,2) && not (null accum) = compress zs maxSeqLen (z:accum) | otherwise = reverse accum ++ (if null accum || takeWhile' compressedStr 0 /= 0 then [] else "1") ++ compressedStr ++ compress (drop lengthOriginal zzs) maxSeqLen [] where segment'@(s,(start,end)) = segment zzs 0 maxSeqLen combinations = combs (fst $ segment') [] takeWhile' xxs count | null xxs = 0 | x == '1' && null (reads (take 1 xs)::[(Int,String)]) = count | not (null (reads [x]::[(Int,String)])) = 0 | otherwise = takeWhile' xs (count + 1) where x:xs = xxs f (lenC,cStr,lenOrig) (lenC',cStr',lenOrig') = let g = compare ((fromIntegral lenC + if not (null accum) && takeWhile' cStr 0 == 0 then 1 else 0) / fromIntegral lenOrig) ((fromIntegral lenC' + if not (null accum) && takeWhile' cStr' 0 == 0 then 1 else 0) / fromIntegral lenOrig') in if g == EQ then compare (takeWhile' cStr' 0) (takeWhile' cStr 0) else g (lenCompressed,compressedStr,lengthOriginal) = head $ sortBy f (map (buildString (take end zzs)) (map reverse combinations))
Output:
*Main> compress "aaaaaaaaabbbbbbbbbaaaaaaaaabbbbbbbbb" 100 [] "a9b9a9b9" *Main> compress "aaabbbaaabbbaaabbbaaabbb" 100 [] "aaabbb4"