开发者

coupling by using dp

开发者 https://www.devze.com 2023-04-11 03:58 出处:网络
Given two strings, S1 & S2. given scoring scheme where gap penalty, mismatch score and match score.

Given two strings, S1 & S2. given scoring scheme where gap penalty, mismatch score and match score.

Find the S1 which have a best match with S2.

My idea is to list a开发者_开发技巧ll possible S1 and then match one by one with S2. List all possible S1 by using brute force. Then match each possible S1 with S2 by using dp.

Is there is any faster way to do so? or suggest any reference?


Using Wikipedia and a little bit of thinking one could code up something like this:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#ifndef MAX
#define MAX(a,b) ((a)>(b)?(a):(b))
#endif

#define MATCH_SCORE     2
#define MISMATCH_SCORE  -1
#define GAP_SCORE       0

// Calculates the match score recursively.
long MatchScore(const char* p/* in: may contain 'x' */,
                const char* s/* in: doesn't contain 'x' */)
{
  long res;

  if (*p && *s)
  {
    if ((*p == *s) ||
        ((*p == 'x') && (*s >= 'a') && (*s <= 'f')))
    {
      res = MatchScore(p + 1, s + 1) + MATCH_SCORE;
    }
    else
    {
      long s1 = MatchScore(p + 1, s + 1) + MISMATCH_SCORE;
      long s2 = MatchScore(p, s + 1) + GAP_SCORE;
      long s3 = MatchScore(p + 1, s) + GAP_SCORE;
      res = MAX(s1, MAX(s2, s3));
    }
  }
  else
  {
    res = GAP_SCORE * (long)(strlen(p) + strlen(s));
  }

  return res;
}

// Calculates the best matching string and the match score
// using dynamic programming, the score must be the same
// as returned by MatchScore().
void FindBestMatch(const char* p/* in: may contain 'x' */,
                   const char* s/* in: doesn't contain 'x' */,
                   long* score/* out: match score */,
                   char** match/* out: best matching string */)
{
  size_t lp = strlen(p) + 1;
  size_t ls = strlen(s) + 1;
  size_t i, j;
  long* table = (long*)malloc(lp * ls * sizeof(long));

  for (i = 0; i < lp; i++)
    table[0 * lp + i] = GAP_SCORE * i;

  for (j = 0; j < ls; j++)
    table[j * lp + 0] = GAP_SCORE * j;

  for (j = 1; j < ls; j++)
  {
    for (i = 1; i < lp; i++)
    {
      if ((p[i-1] == s[j-1]) ||
          ((p[i-1] == 'x') && (s[j-1] >= 'a') && (s[j-1] <= 'f')))
      {
        table[j * lp + i] = table[(j-1) * lp + (i-1)] + MATCH_SCORE;
      }
      else
      {
        table[j * lp + i] =
          MAX(table[(j-1) * lp + (i-1)] + MISMATCH_SCORE,
              MAX(table[(j-1) * lp + i] + GAP_SCORE,
                  table[j * lp + (i-1)] + GAP_SCORE));
      }
    }
  }

  *score = table[lp * ls - 1];

  // Now, trace back the score table and construct the best matching string

  *match = (char*)malloc(lp);
  (*match)[lp - 1] = '\0';

  for (j = ls, i = lp; j || i;)
  {
    if ((p[i-1] == s[j-1]) ||
        ((p[i-1] == 'x') && (s[j-1] >= 'a') && (s[j-1] <= 'f')))
    {
      (*match)[i-1] = s[j-1];
      j--;
      i--;
    }
    else
    {
      if (table[(j-1) * lp + i] > table[j * lp + (i-1)])
      {
        j--;
      }
      else
      {
        (*match)[i-1] = p[i-1];
        i--;
      }
    }
  }

  free(table);
}

int main(void)
{
  const char* pattern = "acdxdcxecxf";
  const char* str = "abdfdaaed";
  long score;
  char* match;
  char* match2;

  printf("pattern=\"%s\" str=\"%s\"\n", pattern, str);
  FindBestMatch(pattern, str, &score, &match);
  printf("score=%ld (recursive)\n", MatchScore(pattern, str));
  printf("score=%ld best match=\"%s\"\n", score, match);

  // Now repeat with the best match we've just found,
  // the result must be the same
  printf("\nRepeating with pattern=best match:\n\n");

  printf("pattern=\"%s\" str=\"%s\"\n", match, str);
  FindBestMatch(match, str, &score, &match2);
  printf("score=%ld (recursive)\n", MatchScore(match, str));
  printf("score=%ld best match=\"%s\"\n", score, match2);

  free(match);
  free(match2);
  return 0;
}

Output:

pattern="acdxdcxecxf" str="abdfdaaed"
score=14 (recursive)
score=14 best match="acdfdcaecdf"

Repeating with pattern=best match:

pattern="acdfdcaecdf" str="abdfdaaed"
score=14 (recursive)
score=14 best match="acdfdcaecdf"

I wonder if there're any bugs (other than the apparent lack of error checking).

0

精彩评论

暂无评论...
验证码 换一张
取 消

关注公众号