Randomly select a specific subsequence from a string

For a string containing characters interleaved with runs of dashes, for example string s = "A--TG-DF----GR--";, I want to randomly select a block of dashes (of size 1, 2, ..., up to the maximum number of consecutive dashes in the string) and move it to another, randomly selected position in the string.

For example, A--TG-DF(---)-GR-- becomes A--T(---)G-DF-GR--, while another iteration could turn A--TG-DF----GR(--) into A--TG-(--)DF----GR.

I generate random string indices with int i = rand() % (int) s.length();. The insertion is done with s.insert(rand() % (int) s.length(), substr);, where substr is the block of dashes.

My main problem is randomly finding a contiguous group of dashes. I thought about using s.find("-"), but that only returns the first occurrence of a single dash, not the position of a randomly chosen group of dashes.

+2
source share
4 answers

I know that this problem is most likely an XY problem, but I found it a nice challenge nonetheless, so I thought about implementing it with the Boost Interval Container Library.

The beauty of the library is that you can forget about a lot of details, while not sacrificing much performance.

, ( ).

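Live On Coliru: generating 1,000,000 random relocations of the sample string, each moving a random number of dash blocks (1..3), takes about 2673 ms (1156 ms in the run shown below):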

Generator gen(test_case);

std::string result;
std::map<std::string, size_t> histo;

for(int i = 0; i < 1000000; ++i) {
    auto const mobility = gen.gen_relocations(1 + gen.random(3)); // move n blocks of dashes

    result.clear();
    gen.apply_relocations(mobility, std::back_inserter(result));

    histo[result]++;
}

: ,

, :

  • "" :

    // Shorthand for the Boost Interval Container Library namespace.
    namespace icl = boost::icl;
    
    // Position: index of a character in the input string.
    using Position = size_t;
    // Map: a set of positions (used for the dash positions of the input).
    using Map      = icl::interval_set<Position>;
    // Region: the element type of Map, i.e. one run of positions.
    using Region   = Map::value_type;
    

    . , Map, :

    // Collects the position of every '-' in [f, l) into a Map.
    template <typename It> Map region_map(It f, It l) {
        Map dash_positions;
        Position index = 0;

        while (f != l) {
            if (*f == '-')
                dash_positions += index;
            ++f;
            ++index;
        }

        return dash_positions;
    }
    

    , . _set . , . KISS .

  • , a Region Position .

    // Insertion position -> region to move there. A multimap, because several
    // regions may be assigned the same insertion position.
    using Relocs   = boost::container::flat_multimap<Position, Region>;
    

    , , . reserve() -ed , node.

  • , :

    // Draws n dash regions at random (with replacement) and returns them as a
    // set; returns an empty set when the input has no dashes at all.
    Map pick_dashes(int n) {
        Map picked;
        if (_dashes.empty())
            return picked;

        for (int remaining = n; remaining > 0; --remaining) {
            auto const chosen = std::next(_dashes.begin(), _select(_rng));
            picked += *chosen;
        }
        return picked;
    }
    

    , :

      _dashes(region_map(_input.begin(), _input.end())),
      _rng(std::random_device {}()),
      _select (0, _dashes.iterative_size() - 1),
      _randpos(0, _input.size() - 1),
    
  • . () .

    • , .
    • , , :

        _is_degenerate(cardinality(_dashes) == _input.size())
      

    , :

    // Draws n dash regions via pick_dashes(n) and pairs every selected region
    // with an insertion position, returning the pairs as a Relocs container.
    Relocs gen_relocations(int n=1) {
        Map const moving = pick_dashes(n);
    
        Relocs relocs;
        relocs.reserve(moving.iterative_size());
    
        if (_is_degenerate)
        {
            // degenerate case (everything is a dash); no non-moving positions
            // exist, just pick 0
            for(auto& region : moving)
                relocs.emplace(0, region);
        } else {
            // Draws random positions until one is found that is not part of
            // any region selected for moving.
            auto inertia = [&] {
                Position inert_point;
                while (contains(moving, inert_point = _randpos(_rng)))
                    ; // discard insertion points that are moving
                return inert_point;
            };
    
            for(auto& region : moving)
                relocs.emplace(inertia(), region);
        }
    
        return relocs;
    }
    

    , , .

  • . , , (KISS):

    // Walks the input as a sequence of "steps" and calls
    // apply(region, relocated) for every piece to emit, in output order:
    // relocated == true for a moved region, false for a piece that stays put.
    template <typename F>
        void do_apply_relocations(Relocs const& mobility, F const& apply) const {
            // Start with the whole input as one step.
            icl::split_interval_set<Position> steps {{0, _input.size()}};
    
            for (auto& m : mobility) {
                steps += m.first; // force a split on insertion point
                steps -= m.second; // remove the source of the move
                //std::cout << m.second << " moving to #" << m.first << ": " << steps << "\n";
            }
    
            auto next_mover = mobility.begin();
    
            for(auto& step : steps) {
                // Emit every pending mover whose insertion position lies in
                // this step before emitting the step itself.
                while (next_mover!=mobility.end() && contains(step, next_mover->first))
                    apply((next_mover++)->second, true);
    
                apply(step, false);
            }
        }
    

    . , "" split_interval_set, , "" : "" .

  • apply , , , . A--TG-DFGR(----)--, , (, std::string), :

    // Writes the input with the given relocations applied to `out` and returns
    // the advanced output iterator. Moved regions are wrapped in '(' and ')'.
    template <typename Out>
        Out apply_relocations(Relocs const& mobility, Out out) const {
            // Degenerate input is copied through unchanged (no markers).
            if (_is_degenerate)
                return std::copy(_input.begin(), _input.end(), out);
    
            // Maps a Position to the corresponding iterator into _input.
            auto to_iter = [this](Position pos) { return _input.begin() + pos; };
    
            do_apply_relocations(mobility, [&](Region const& r, bool relocated) {
                if (relocated) *out++ = '(';
                // Copy the characters of r, from first(r) up to and including last(r).
                out = std::copy(
                    to_iter(first(r)),
                    to_iter(last(r)+1),
                    out
                );
                if (relocated) *out++ = ')';
            });
    
            return out;
        }
    

    . "" Position (to_iter) () .

.

#include <boost/container/flat_map.hpp>
#include <boost/icl/interval_set.hpp>
#include <boost/icl/separate_interval_set.hpp>
#include <boost/icl/split_interval_set.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/range/algorithm.hpp>
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <functional>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <string>

// Shorthand for the Boost Interval Container Library namespace.
namespace icl = boost::icl;

// Position: index of a character in the input string.
using Position = size_t;
// Map: a set of positions (used for the dash positions of the input).
using Map      = icl::interval_set<Position>;
// Region: the element type of Map, i.e. one run of positions.
using Region   = Map::value_type;
// Relocs: insertion position -> region to move there; a multimap because
// several regions may be assigned the same insertion position.
using Relocs   = boost::container::flat_multimap<Position, Region>;

// Generates random relocations of dash blocks for one fixed input string and
// renders the input with such relocations applied.
struct Generator {
    // Copies `input`, records the positions of its dashes and seeds the RNG
    // from std::random_device.
    Generator(std::string const& input) 
        : _input(input),
          _dashes(region_map(_input.begin(), _input.end())),
          _rng(std::random_device {}()),
          // NOTE: with no dash regions the upper bound of _select wraps around
          // (size_t 0 - 1); pick_dashes() checks _dashes.empty() before drawing.
          _select (0, _dashes.iterative_size() - 1),
          // NOTE: with an empty input the upper bound of _randpos wraps around;
          // _is_degenerate is then true (0 == 0) and gen_relocations() does
          // not draw from _randpos.
          _randpos(0, _input.size() - 1),
          _is_degenerate(cardinality(_dashes) == _input.size())
    {
    }

    // Returns the next raw RNG value reduced modulo `below`.
    // `below` must be non-zero (it is used as a divisor).
    unsigned random(unsigned below) {
        return _rng() % below; // q&d, only here to make the tests deterministic for a fixed seed
    }

    // Returns a Map built from the single interval {0, _input.size()}.
    Map full() const { 
        return Map { { 0, _input.size() } };
    }

    // Draws n dash regions via pick_dashes(n) and pairs every selected region
    // with an insertion position, returning the pairs as a Relocs container.
    Relocs gen_relocations(int n=1) {
        Map const moving = pick_dashes(n);

        Relocs relocs;
        relocs.reserve(moving.iterative_size());

        if (_is_degenerate)
        {
            // degenerate case (everything is a dash); no non-moving positions
            // exist, just pick 0
            for(auto& region : moving)
                relocs.emplace(0, region);
        } else {
            // Draws random positions until one is found that is not part of
            // any region selected for moving.
            auto inertia = [&] {
                Position inert_point;
                while (contains(moving, inert_point = _randpos(_rng)))
                    ; // discard insertion points that are moving
                return inert_point;
            };

            for(auto& region : moving)
                relocs.emplace(inertia(), region);
        }

        return relocs;
    }

    // Writes the input with the given relocations applied to `out` and returns
    // the advanced output iterator. Moved regions are wrapped in '(' and ')'.
    template <typename Out>
        Out apply_relocations(Relocs const& mobility, Out out) const {
            // Degenerate input is copied through unchanged (no markers).
            if (_is_degenerate)
                return std::copy(_input.begin(), _input.end(), out);

            // Maps a Position to the corresponding iterator into _input.
            auto to_iter = [this](Position pos) { return _input.begin() + pos; };

            do_apply_relocations(mobility, [&](Region const& r, bool relocated) {
                if (relocated) *out++ = '(';
                // Copy the characters of r, from first(r) up to and including last(r).
                out = std::copy(
                    to_iter(first(r)),
                    to_iter(last(r)+1),
                    out
                );
                if (relocated) *out++ = ')';
            });

            return out;
        }

    // Walks the input as a sequence of "steps" and calls
    // apply(region, relocated) for every piece to emit, in output order:
    // relocated == true for a moved region, false for a piece that stays put.
    template <typename F>
        void do_apply_relocations(Relocs const& mobility, F const& apply) const {
            // Start with the whole input as one step.
            icl::split_interval_set<Position> steps {{0, _input.size()}};

            for (auto& m : mobility) {
                steps += m.first; // force a split on insertion point
                steps -= m.second; // remove the source of the move
                //std::cout << m.second << " moving to #" << m.first << ": " << steps << "\n";
            }

            auto next_mover = mobility.begin();

            for(auto& step : steps) {
                // Emit every pending mover whose insertion position lies in
                // this step before emitting the step itself.
                while (next_mover!=mobility.end() && contains(step, next_mover->first))
                    apply((next_mover++)->second, true);

                apply(step, false);
            }
        }

  private:
    std::string                             _input;         // private copy of the input string
    Map                                     _dashes;        // positions of all '-' characters in _input
    std::mt19937                            _rng;           // random engine shared by all draws
    std::uniform_int_distribution<Position> _select;        // picks which dash region to move
    std::uniform_int_distribution<Position> _randpos;       // picks a candidate insertion position
    bool                                    _is_degenerate; // true when every character of _input is a dash

    // Draws n dash regions at random (with replacement) and returns them as a
    // set; returns an empty set when the input has no dashes at all.
    Map pick_dashes(int n) {
        Map map;
        if (!_dashes.empty())
            for (int i = 0; i < n; ++i)
                map += *std::next(_dashes.begin(), _select(_rng));
        return map;
    }

    // Collects the position of every '-' in [f, l) into a Map.
    // Called from the constructor's initializer list; reads no members.
    template <typename It> Map region_map(It f, It l) {
        Map dashes;

        for (Position pos = 0; f!=l; ++f, ++pos)
            if ('-' == *f)
                dashes += pos;

        return dashes;
    }
};

// Benchmarks the generator on a few inputs and prints, for each input, the
// number of distinct results and the ten most frequent ones.
int main() {
    using Clock = std::chrono::high_resolution_clock;

    // All-dash, typical, empty and dash-free inputs.
    for (std::string test_case : {
            "----",
            "A--TG-DF----GR--",
            "",
            "ATGDFGR",
        })
    {
        auto const started = Clock::now();
        Generator gen(test_case);

        std::map<std::string, size_t> histo; // rendered result -> occurrences
        std::string result;                  // buffer reused across iterations

        for (int round = 0; round != 1000000; ++round) {
            // move n blocks of dashes
            auto const mobility = gen.gen_relocations(1 + gen.random(3));

            result.clear();
            gen.apply_relocations(mobility, std::back_inserter(result));

            ++histo[result];
        }

        std::cout << histo.size() << " unique results for '" << test_case << "'"
                  << " in "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(Clock::now() - started).count()
                  << "ms\n";

        // Re-key the histogram by count, highest count first.
        std::multimap<size_t, std::string, std::greater<size_t> > ranked;
        for (auto& entry : histo)
            ranked.emplace(entry.second, entry.first);

        // Print at most the ten most frequent results.
        int shown = 0;
        for (auto it = ranked.begin(); it != ranked.end() && shown < 10; ++it, ++shown)
            std::cout << std::setw(8) << std::right << it->first << ": " << it->second << "\n";
    }
}

,

1 unique results for '----' in 186ms
 1000000: ----
3430 unique results for 'A--TG-DF----GR--' in 1156ms
    9251: A(----)--TG-DFGR--
    9226: (----)A--TG-DFGR--
    9191: A--T(----)G-DFGR--
    9150: A--TG-DFGR-(----)-
    9132: A--(----)TG-DFGR--
    9128: A--TG(----)-DFGR--
    9109: A--TG-D(----)FGR--
    9098: A--TG-DFG(----)R--
    9079: A--TG-DFGR(----)--
    9047: A-(----)-TG-DFGR--
1 unique results for '' in 25ms
 1000000: 
1 unique results for 'ATGDFGR' in 77ms
 1000000: ATGDFGR
+4

, , , .

( , ):

#include <cstddef>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

, . std::string::find_first_of std::string::find_first_not_of, . , , cbegin(). , .

// Returns an iterator to the first character of every maximal run of `c` in
// `text`, in order of appearance. The iterators point into `text`, so `text`
// must outlive the returned vector.
std::vector<std::string::const_iterator>
find_conscutive_sequences(const std::string& text, const char c)
{
  std::vector<std::string::const_iterator> run_starts {};
  std::size_t cursor = text.find(c);
  while (cursor != std::string::npos)
    {
      run_starts.push_back(text.cbegin() + cursor);
      // Skip to the first character after this run; none left means done.
      const auto run_end = text.find_first_not_of(c, cursor);
      if (run_end == std::string::npos)
        break;
      cursor = text.find(c, run_end);
    }
  return run_starts;
}

, .

, . , ++ 11, , rand.

positions, , , .

// Returns one element of `positions`, chosen uniformly at random using `prng`.
// Throws std::invalid_argument when `positions` is empty.
std::string::const_iterator
get_random_consecutive_sequence(const std::vector<std::string::const_iterator>& positions,
                                std::default_random_engine& prng)
{
  if (positions.empty())
    throw std::invalid_argument {"string does not contain any sequence"};
  const std::size_t highest = positions.size() - 1UL;
  std::uniform_int_distribution<std::size_t> pick_index {0UL, highest};
  return positions.at(pick_index(prng));
}

, , . // .

// Returns a copy of `text` in which the run of identical characters beginning
// at `start` is enclosed in parentheses. The run extends from `start` up to
// the first character that differs from `*start`, or to the end of `text`.
//
// `start` must be an iterator into `text` that points at a character.
// Throws std::invalid_argument when `start` is the end iterator (which is
// always the case for an empty `text`), since there is nothing to mark and
// dereferencing it would be undefined behaviour.
std::string
mark_sequence(const std::string& text,
              const std::string::const_iterator start)
{
  if (start == text.cend())
    throw std::invalid_argument {"start does not point at a character of text"};
  const auto c = *start;
  const auto first = static_cast<std::size_t>(start - text.cbegin());
  std::size_t last = text.find_first_not_of(c, first);
  if (last == std::string::npos)
    last = text.length();  // the run extends to the end of the string
  const auto run_end = text.cbegin() + last;
  std::string marked {};
  marked.reserve(text.length() + 2UL);  // room for the two parentheses
  // Append by iterator range to avoid creating temporary substrings.
  marked.append(text.cbegin(), start);
  marked += '(';
  marked.append(start, run_end);
  marked += ')';
  marked.append(run_end, text.cend());
  return marked;
}

.

// Demo: prints the example string ten times, each time with one randomly
// chosen run of dashes enclosed in parentheses.
int
main()
{
  const std::string example {"--A--B-CD----E-F---"};
  const auto run_starts = find_conscutive_sequences(example, '-');

  // Seed the engine once from the system's random device.
  std::random_device entropy {};
  std::default_random_engine engine {entropy()};

  for (int remaining = 10; remaining > 0; --remaining)
    {
      const auto chosen = get_random_consecutive_sequence(run_starts, engine);
      std::cout << mark_sequence(example, chosen) << std::endl;
    }
}

:

--A--B-CD(----)E-F---
--A--B(-)CD----E-F---
--A(--)B-CD----E-F---
--A(--)B-CD----E-F---
--A--B-CD(----)E-F---
--A--B-CD----E-F(---)
--A--B-CD----E-F(---)
(--)A--B-CD----E-F---
--A--B(-)CD----E-F---
(--)A--B-CD----E-F---
+1

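string::find() has an optional second parameter: the position at which to start the search. So something like s.find("-", rand() % L) may do the trick, where L is the position of the last dash (+ 1).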

0

As I understand it, all dash blocks should have the same probability of being chosen. Therefore, we must first find the positions where all these blocks begin, and then select one of these positions at random.

If I am allowed to use Smalltalk for pseudocode, then I will first find the indexes where each block of dashes begins:

dashPositionsOn: aString
    "Answer an OrderedCollection holding the index at which each run of
     consecutive dashes in aString begins (indexes start at 1)."
    | indexes i n |
    indexes := OrderedCollection new.
    i := 1.
    n := aString size.
    [i <= n] whileTrue: [| char |
        char := aString at: i.
        char = $-
            ifTrue: [
                "i is the start of a run: record it, then skip the rest of the run"
                indexes add: i.
                [
                    i := i + 1.
                    i <= n and: [
                        char := aString at: i.
                        char = $-]] whileTrue]
            ifFalse: [i := i + 1]].
    ^indexes

Now we can select one of these indexes at random: indexes atRandom.

Note that there are (many) better ways to implement this algorithm in Smalltalk (as well as in other languages).

0
source

All Articles