Linux C下的正则表达式

<regex.h> 不属于 ISO C 标准库,而是 POSIX 标准定义的头文件,因此只能在 Linux 等符合 POSIX 的系统上直接使用。

相关结构体:

/* Type for byte offsets within the string. POSIX mandates this. */
typedef int regoff_t;

/* Location of one match, given as a pair of byte offsets into the
   searched string.  regexec() takes an array of these as its pmatch[]
   argument.  The range is half-open: rm_eo is the offset just past the
   last matched byte. */
typedef struct
{
  regoff_t rm_so;   /* Byte offset from string's start to substring's start. */
  regoff_t rm_eo;   /* Byte offset from string's start to substring's end. */
} regmatch_t;

/* Type of the `translate' member of struct re_pattern_buffer.  This whole
   section is skipped when the includer has already defined
   RE_TRANSLATE_TYPE; the unprefixed RE_TRANSLATE_TYPE name is only
   provided when __USE_GNU is defined. */
#ifndef RE_TRANSLATE_TYPE
# define __RE_TRANSLATE_TYPE unsigned char *
# ifdef __USE_GNU
# define RE_TRANSLATE_TYPE __RE_TRANSLATE_TYPE
# endif
#endif

/* Controls how members of struct re_pattern_buffer are named: with
   __USE_GNU they keep their plain names, otherwise every name is given a
   `__' prefix via token pasting. */
#ifdef __USE_GNU
# define __REPB_PREFIX(name) name
#else
# define __REPB_PREFIX(name) __##name
#endif

/* Buffer holding a compiled regular expression; `regex_t' is a typedef of
   this struct.  Every member except re_nsub is declared through
   __REPB_PREFIX, so its visible name depends on whether __USE_GNU is
   defined. */
struct re_pattern_buffer
{
/* Space that holds the compiled pattern. It is declared as
`unsigned char *' because its elements are sometimes used as
array indexes. */
  unsigned char *__REPB_PREFIX(buffer);

/* Number of bytes to which `buffer' points. */
  unsigned long int __REPB_PREFIX(allocated);

/* Number of bytes actually used in `buffer'. */
  unsigned long int __REPB_PREFIX(used);

/* Syntax setting with which the pattern was compiled. */
  reg_syntax_t __REPB_PREFIX(syntax);

/* Pointer to a fastmap, if any, otherwise zero. re_search uses the
fastmap, if there is one, to skip over impossible starting points
for matches. */
  char *__REPB_PREFIX(fastmap);

/* Either a translate table to apply to all characters before
comparing them, or zero for no translation. The translation is
applied to a pattern when it is compiled and to a string when it
is matched. */
  __RE_TRANSLATE_TYPE __REPB_PREFIX(translate);

/* Number of subexpressions found by the compiler.  This is the only
member whose name is not routed through __REPB_PREFIX. */
  size_t re_nsub;

/* Zero if this pattern cannot match the empty string, one else.
Well, in truth it's used only in `re_search_2', to see whether or
not we should use the fastmap, so we don't set this absolutely
perfectly; see `re_compile_fastmap' (the `duplicate' case). */
  unsigned __REPB_PREFIX(can_be_null) : 1;

/* If REGS_UNALLOCATED, allocate space in the `regs' structure
for `max (RE_NREGS, re_nsub + 1)' groups.
If REGS_REALLOCATE, reallocate space if necessary.
If REGS_FIXED, use what's there.
The three REGS_* constants are only defined when __USE_GNU is set. */
  #ifdef __USE_GNU
  # define REGS_UNALLOCATED 0
  # define REGS_REALLOCATE 1
  # define REGS_FIXED 2
#endif
  unsigned __REPB_PREFIX(regs_allocated) : 2;

/* Set to zero when `regex_compile' compiles a pattern; set to one
by `re_compile_fastmap' if it updates the fastmap. */
  unsigned __REPB_PREFIX(fastmap_accurate) : 1;

/* If set, `re_match_2' does not return information about
subexpressions. */
  unsigned __REPB_PREFIX(no_sub) : 1;

/* If set, a beginning-of-line anchor doesn't match at the beginning
of the string. */
  unsigned __REPB_PREFIX(not_bol) : 1;

/* Similarly for an end-of-line anchor. */
  unsigned __REPB_PREFIX(not_eol) : 1;

/* If true, an anchor at a newline matches. */
  unsigned __REPB_PREFIX(newline_anchor) : 1;
};

typedef struct re_pattern_buffer regex_t;

 Linux C 使用 regex 的一般步骤:

编译  regcomp()
匹配  regexec()
释放  regfree()

相关API函数:

int regcomp(regex_t *preg, const char *pattern, int cflags);             // compile `pattern` into *preg
int regexec(const regex_t *preg, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);   // match `string` against the compiled preg
size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);   // NOTE(review): presumably writes a message for errcode into errbuf — confirm against regex(3)
void regfree(regex_t *preg);        // release the compiled pattern

/* Usage fragment: compile "(.?)xml" once, then test str against it.
   The compiled-pattern type declared by <regex.h> is regex_t (there is
   no type named `regex`). */
regex_t regHead;

// Compile the pattern into regHead

regcomp(&regHead, "(.?)xml", REG_EXTENDED);

static CHAR str[MAX_STR_LINE];

regmatch_t pmatch[2];

// Match: the expression below is true when str matches the pattern

regexec(&regHead, str, 2, pmatch, 0) == 0

regmatch_t 是一个结构体数据类型,在regex.h中定义:成员rm_so 存放匹配文本串在目标串中的开始位置(字节偏移),rm_eo 存放结束位置(匹配文本串最后一个字符的下一个位置)。

通常我们以数组的形式定义一组这样的结构。因为往往我们的正则表达式中还包含子正则表达式

str是目标文本串。

2 代表数组 pmatch 的元素个数。数组 0 单元存放主正则表达式的位置,后边的单元依次存放子正则表达式的位置;子正则表达式就是用圆括号包起来的部分表达式。

pmatch[0].rm_so和pmatch[0].rm_eo代表主正则表达式的起止位置(从x的前一个字符  到  字符l的后一个字符 ),pmatch[1].rm_so和pmatch[1].rm_eo代表子正则表达式的起止位置(从x的前一个字符  到  字符x)。

 //清除

void regfree (regex_t *compiled)

当我们使用完编译好的正则表达式后,或者要重新编译其他正则表达式的时候,我们可以用这个函数清空compiled指向的regex_t结构体的内容,请记住,如果是重新编译的话,一定要先清空regex_t结构体。

/*
 * return zero if the regular expression matches; otherwise, it returns a nonzero value.
 * MSGDEF regular one preChar, otherwise regular afterStr in "".
 * pmatch[0].rm_so, pmatch[0].rm_eo give the start and end of the whole matched substring as a half-open range [rm_so, rm_eo), without first blank space.
 * pmatch[n].rm_so, pmatch[n].rm_eo give the start and end of the n-th sub-expression's match, also as a half-open range [rm_so, rm_eo), if n is bigger than zero.
 */

 注意: regexec 返回的是最左、最长的匹配;当模式中带有贪婪量词(如 .*)时,匹配会一直延伸到满足条件的最后一个str。在使用strstr实现的时候应注意,strstr是匹配第一个str。

 参考文献:

1. C语言用regcomp、regexec、regfree和regerror函数实现正则表达式校验 

2. C语言正则表达式详解 regcomp() regexec() regfree()用法详解

 

posted @ 2021-03-25 14:23  Lunais  阅读(550)  评论(0编辑  收藏  举报