mardi 1 novembre 2016

How to hash very large substrings quicly without collisions?

I have an app which as part of it finds all palindrome substrings of the input string. The input string can be up to 100,000 in length so the substrings can be very large. For example one input to the app resulted in over 300,000 substring palindromes over 10,000 in length. The app later counts all palindromes for equality and counts the unique ones by a hash that uses the standard hash that is done in the function that finds the palindromes. The hashes are stored in a vector and later counted for uniqueness in the app. The problems with such input and outptut conditions is the hashing for the the very large substrings takes too long plus gets collisions in the hashes. So I was wondering if there is an algorithm (hash) that can quickly and uniquely hash a very large substring (preferably by index range for the substring for speed, but with accuracy for uniqueness). The hashing is done at the end of the function get_palins. The code is below.

#include <iostream>
#include <string>
#include <cstdlib>
#include <time.h>
#include <vector>
#include <algorithm>
#include <unordered_map>
#include <map>
#include <cstdio>
#include <cmath>
#include <ctgmath>

using namespace std;

#define MAX 100000
#define mod 1000000007

vector<long long> palins[MAX+5];

//  Finds all palindromes for the string
void  get_palins(string &s)
{
     int N = s.length();
     int i, j, k,   // iterators
     rp,        // length of 'palindrome radius'
     R[2][N+1]; // table for storing results (2 rows for odd- and even-length palindromes

     s = "@" + s + "#"; // insert 'guards' to iterate easily over s

     for(j = 0; j <= 1; j++)
     {
         R[j][0] = rp = 0; i = 1;

         while(i <= N)
         {
             while(s[i - rp - 1] == s[i + j + rp]) {  rp++;  }
             R[j][i] = rp;
             k = 1;
             while((R[j][i - k] != rp - k) && (k < rp))
             {
                 R[j][i + k] = min(R[j][i - k],rp - k);
                 k++;
             }
             rp = max(rp - k,0);
             i += k;
         }
     }

     s = s.substr(1,N); // remove 'guards'

     for(i = 1; i <= N; i++)
     {
        for(j = 0; j <= 1; j++)
            for(rp = R[j][i]; rp > 0; rp--)
            {
                int begin = i - rp - 1;
                int end_count = 2 * rp + j;
                int end = begin + end_count - 1;
                if (!(begin == 0  && end == N -1 ))
                {
                   string ss = s.substr(begin, end_count);
                   long long hsh = hash<string>{}(ss);
                   palins[begin].push_back(hsh);

                }
          }
     }
}
unordered_map<long long, int> palin_counts;
unordered_map<char, int> end_matches;

// Solve when at least 1 character in string is different
void solve_all_not_same(string &s)
{
    int n = s.length();
    long long count = 0;

    get_palins(s);

    long long palin_count = 0;

    // Gets all palindromes into unordered map
    for (int i = 0; i <= n; i++)
    {
        for (auto& it : palins[i])
        {
            if (palin_counts.find(it)  == palin_counts.end())
            {
                palin_counts.insert({it,1});
            }
            else
            {
                palin_counts[it]++;
            }
        }
    }

    // From total palindromes, get proper border count
    // minus end characters of substrings
    for ( auto it = palin_counts.begin(); it != palin_counts.end(); ++it )
    {
        int top = it->second - 1;

        palin_count += (top * (top + 1)) / 2;
        palin_count %= mod;
    }

    // Store string character counts in unordered map
    for (int i = 0; i <= n; i++)
    {
        char c = s[i];

        //long long hsh = hash<char>{}(c);

        if (end_matches[c] == 0)
            end_matches[c] = 1;
        else
            end_matches[c]++;

    }

    // From substring end character matches, get proper border count
    // for end characters of substrings
    for ( auto it = end_matches.begin(); it != end_matches.end(); it++ )
    {
        int f = it->second - 1;
        count += (f * (f + 1)) / 2;
    }

    cout << (count + palin_count) % mod << endl;

    for (int i = 0; i < MAX+5; i++)
        palins[i].clear();
}

int main()
{

    string s; 
    cin >> s;

    solve_all_not_same(s);

    return 0;
}

Aucun commentaire:

Enregistrer un commentaire