有限自动机

有限自动机(Finite Automata)字符串匹配算法的步骤为

1.构造自动机

2.基于自动机匹配

该算法的最大难点在于自动机的构建,一旦构建完成,那么匹配将变得非常简单

自动机的构建的思想也是基于 最长前缀后缀,类似 KMP算法

假设我们要在 bacbababaabcbab 字符串里查找 abababca,那么我们只需要构建 模式abababca的自动机即可,状态从 0 开始

下面是简单的构建思想,lps为最长前缀后缀

     a  b  c  d
0    1  0  0  0        lps = 0  a
1    1  2  0  0        lps = 0  ab
2    3  0  0  0        lps = 1  aba(a)
3    1  4  0  0        lps = 2  abab(ab)
4    5  0  0  0        lps = 3  ababa(aba)
5    1  6  0  0        lps = 4  ababab(abab)
6    5  0  6  0        lps = 0  abababc
7    8  0  0  0        lps = 1  abababca(a)
8    1  0  0  0



linux c例子

#include <stdio.h> 
#include <string.h> 
#define NO_OF_CHARS 256 
  

void computeTransFun(char* pat, int M, int TF[][NO_OF_CHARS]) 
{ 
    int i, lps = 0, x; 
  
    //初始状态
    for (x = 0; x < NO_OF_CHARS; x++) 
        TF[0][x] = 0; 
    TF[0][pat[0]] = 1; 
    
    //构建表
    for (i = 1; i <= M; i++) { 
        
        //这里先拷贝最长前缀后缀可能的跳转
        //这样匹配失败时,按照最长前缀后缀可以无需从状态0开始匹配
        for (x = 0; x < NO_OF_CHARS; x++) 
            TF[i][x] = TF[lps][x]; 
  
        // 当状态为 i 时遇到 pat[i] 字符时进入的状态
        TF[i][pat[i]] = i + 1; 
  
        // 更新最长前缀后缀
        if (i < M) 
            lps = TF[lps][pat[i]]; 
    }
} 

void search(char* pat, char* txt) 
{
    int M = strlen(pat);
    int N = strlen(txt);
  
    int TF[M + 1][NO_OF_CHARS];
    
    //构建自动机
    computeTransFun(pat, M, TF);
  
    //基于自动机匹配
    int i, j = 0; 
    for (i = 0; i < N; i++) {
        printf("%d -%c- ", j, txt[i]);

        j = TF[j][txt[i]];
        if (j == M) {
            printf("%d acc\n", j);
            printf("\npattern found at index %d\n", i - M + 1);
            j = 0;  //匹配成功,状态重置
        } else
            printf("%d\n", j);
    } 
}

int main()
{
    char* txt = "bacbababaabcabababcam";
    char* pat = "abababca";
    search(pat, txt);
    return 0;
}
[root@izbp1irxwqt7ei21awv6wvz ccc]# ./a.out 
0 -b- 0
0 -a- 1
1 -c- 0
0 -b- 0
0 -a- 1
1 -b- 2
2 -a- 3
3 -b- 4
4 -a- 5
5 -a- 1
1 -b- 2
2 -c- 0
0 -a- 1
1 -b- 2
2 -a- 3
3 -b- 4
4 -a- 5
5 -b- 6
6 -c- 7
7 -a- 8 acc

pattern found at index 12
0 -1- 0
0 -1- 0
0 -a- 1
1 -b- 2
2 -a- 3
3 -b- 4
4 -a- 5
5 -b- 6
6 -c- 7
7 -a- 8 acc

pattern found at index 22


上一篇: Sunday算法
下一篇: 位算法-查找只出现一次的元素
作者邮箱: 203328517@qq.com