mirror of
git://git.savannah.nongnu.org/eliot.git
synced 2025-01-13 20:03:23 +01:00
- Added several tests for the regular expression engine
- New regexp parser using Boost.Spirit. Lex and yacc are now gone. The main advantage of this new parser, apart from being pure C++, is that it can handle wide characters. Currently, the new parser behaves the same as the previous one, but the code is not yet ready to use regular expressions with non-ASCII dictionaries.
This commit is contained in:
parent
597673e8e5
commit
11adaba410
14 changed files with 717 additions and 810 deletions
11
configure.in
11
configure.in
|
@ -23,17 +23,6 @@ AC_PROG_MAKE_SET
|
||||||
AC_PROG_RANLIB
|
AC_PROG_RANLIB
|
||||||
PKG_PROG_PKG_CONFIG
|
PKG_PROG_PKG_CONFIG
|
||||||
|
|
||||||
AC_PROG_YACC
|
|
||||||
if test "$YACC" = yacc ; then
|
|
||||||
AC_MSG_ERROR([Could not find the 'bison' program on your system])
|
|
||||||
fi
|
|
||||||
|
|
||||||
dnl Better than AC_PROG_LEX
|
|
||||||
AM_PROG_LEX
|
|
||||||
if test "$LEX" != "flex" ; then
|
|
||||||
AC_MSG_ERROR([Could not find the 'flex' program on your system])
|
|
||||||
fi
|
|
||||||
|
|
||||||
dnl --------------------------------------------------------------
|
dnl --------------------------------------------------------------
|
||||||
dnl Checks for compilation flags
|
dnl Checks for compilation flags
|
||||||
dnl --------------------------------------------------------------
|
dnl --------------------------------------------------------------
|
||||||
|
|
|
@ -26,8 +26,6 @@ libdic_a_CFLAGS=
|
||||||
libdic_a_YFLAGS=-d
|
libdic_a_YFLAGS=-d
|
||||||
libdic_a_LFLAGS=
|
libdic_a_LFLAGS=
|
||||||
libdic_a_SOURCES = \
|
libdic_a_SOURCES = \
|
||||||
erl.lpp \
|
|
||||||
ery.ypp \
|
|
||||||
dic_exception.cpp dic_exception.h \
|
dic_exception.cpp dic_exception.h \
|
||||||
header.cpp header.h \
|
header.cpp header.h \
|
||||||
dic_internals.h \
|
dic_internals.h \
|
||||||
|
@ -36,44 +34,8 @@ libdic_a_SOURCES = \
|
||||||
dic_search.cpp \
|
dic_search.cpp \
|
||||||
encoding.cpp encoding.h \
|
encoding.cpp encoding.h \
|
||||||
automaton.cpp automaton.h \
|
automaton.cpp automaton.h \
|
||||||
regexp.cpp regexp.h
|
regexp.cpp regexp.h \
|
||||||
|
grammar.cpp grammar.h
|
||||||
BUILT_SOURCES= \
|
|
||||||
libdic_a-erl.cpp \
|
|
||||||
libdic_a-erl.h \
|
|
||||||
libdic_a-ery.cpp \
|
|
||||||
libdic_a-ery.h
|
|
||||||
|
|
||||||
|
|
||||||
nodist_libdic_a_SOURCES= \
|
|
||||||
libdic_a-erl.cpp \
|
|
||||||
libdic_a-erl.h \
|
|
||||||
libdic_a-ery.cpp \
|
|
||||||
libdic_a-ery.h
|
|
||||||
|
|
||||||
# This hook triggers on 'make dist' (and 'make distcheck')
|
|
||||||
# XXX: In fact, the recommended behaviour is:
|
|
||||||
# - list only libdic_a-ery.h in BUILT_SOURCES,
|
|
||||||
# - do not die with an error in configure.in if flex or bison is not found
|
|
||||||
# - do not have any dist-hook trigger
|
|
||||||
# The result is that the generated files are kept in the tarball generated with make dist,
|
|
||||||
# with still an error message for developers when the ypp or lpp file has been modified
|
|
||||||
# and bison or flex is not found.
|
|
||||||
# The problem is that, even though Automake is aware of the header generated by bison,
|
|
||||||
# it seems to have problems with the one generated by flex...
|
|
||||||
dist-hook:
|
|
||||||
-for file in $(BUILT_SOURCES) ; do rm -f $(distdir)/$$file ; done
|
|
||||||
|
|
||||||
CLEANFILES= \
|
|
||||||
libdic_a-erl.cpp \
|
|
||||||
libdic_a-erl.h \
|
|
||||||
libdic_a-ery.cpp \
|
|
||||||
libdic_a-ery.h
|
|
||||||
|
|
||||||
|
|
||||||
## automake workaround to generate .h file
|
|
||||||
libdic_a-erl.h: erl.lpp
|
|
||||||
${LEX} ${srcdir}/erl.lpp
|
|
||||||
|
|
||||||
#####################################
|
#####################################
|
||||||
if BUILD_DICTOOLS
|
if BUILD_DICTOOLS
|
||||||
|
|
21
dic/dic.h
21
dic/dic.h
|
@ -100,7 +100,7 @@ public:
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the character code associated with an element,
|
* Returns the character code associated with an element,
|
||||||
* codes may range from 0 to 31. 0 is the null character.
|
* codes may range from 0 to 63. 0 is the null character.
|
||||||
* @returns code for the encoded character
|
* @returns code for the encoded character
|
||||||
*/
|
*/
|
||||||
const dic_code_t getCode(const dic_elt_t &elt) const;
|
const dic_code_t getCode(const dic_elt_t &elt) const;
|
||||||
|
@ -114,14 +114,14 @@ public:
|
||||||
/**
|
/**
|
||||||
* Returns a boolean to show if there is another available
|
* Returns a boolean to show if there is another available
|
||||||
* character in the current depth (a neighbor in the tree)
|
* character in the current depth (a neighbor in the tree)
|
||||||
* @returns 0 or 1 (true)
|
* @return true if the character is the last one at the current depth
|
||||||
*/
|
*/
|
||||||
bool isLast(const dic_elt_t &elt) const;
|
bool isLast(const dic_elt_t &elt) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a boolean to show if we are at the end of a word
|
* Returns a boolean to show if we are at the end of a word
|
||||||
* (see getNext)
|
* (see getNext())
|
||||||
* @returns 0 or 1 (true)
|
* @return true if this is the end of a word
|
||||||
*/
|
*/
|
||||||
bool isEndOfWord(const dic_elt_t &elt) const;
|
bool isEndOfWord(const dic_elt_t &elt) const;
|
||||||
|
|
||||||
|
@ -132,7 +132,7 @@ public:
|
||||||
const dic_elt_t getRoot() const;
|
const dic_elt_t getRoot() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the next available neighbor (see getLast)
|
* Returns the next available neighbor (see isLast())
|
||||||
* @returns next dictionary element at the same depth
|
* @returns next dictionary element at the same depth
|
||||||
*/
|
*/
|
||||||
const dic_elt_t getNext(const dic_elt_t &elt) const;
|
const dic_elt_t getNext(const dic_elt_t &elt) const;
|
||||||
|
@ -292,21 +292,12 @@ private:
|
||||||
void searchWordByLen(struct params_7plus1_t *params,
|
void searchWordByLen(struct params_7plus1_t *params,
|
||||||
int i, const DAWG_EDGE *edgeptr) const;
|
int i, const DAWG_EDGE *edgeptr) const;
|
||||||
|
|
||||||
/**
|
|
||||||
* Internal version of searchRegExp, needed until
|
|
||||||
* wide chars are supported by our regexp engine.
|
|
||||||
*/
|
|
||||||
void searchRegExpInner(const string &iRegexp,
|
|
||||||
vector<string> &oWordList,
|
|
||||||
struct search_RegE_list_t *iList,
|
|
||||||
unsigned int iMaxResults) const;
|
|
||||||
|
|
||||||
/// Helper for searchRegExp()
|
/// Helper for searchRegExp()
|
||||||
template <typename DAWG_EDGE>
|
template <typename DAWG_EDGE>
|
||||||
void searchRegexpRecTempl(struct params_regexp_t *params,
|
void searchRegexpRecTempl(struct params_regexp_t *params,
|
||||||
int state,
|
int state,
|
||||||
const DAWG_EDGE *edgeptr,
|
const DAWG_EDGE *edgeptr,
|
||||||
vector<string> &oWordList,
|
vector<wstring> &oWordList,
|
||||||
unsigned int iMaxResults) const;
|
unsigned int iMaxResults) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
/*****************************************************************************
|
/*****************************************************************************
|
||||||
* Eliot
|
* Eliot
|
||||||
* Copyright (C) 2002-2007 Antoine Fraboulet
|
* Copyright (C) 2002-2008 Antoine Fraboulet & Olivier Teulière
|
||||||
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
|
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
|
||||||
|
* Olivier Teulière <ipkiss @@ gmail.com>
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License as published by
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
@ -18,13 +19,6 @@
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
/**
|
|
||||||
* \file dic_search.c
|
|
||||||
* \brief Dictionary lookup functions
|
|
||||||
* \author Antoine Fraboulet
|
|
||||||
* \date 2002
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cwchar>
|
#include <cwchar>
|
||||||
|
@ -35,22 +29,13 @@
|
||||||
#include "header.h"
|
#include "header.h"
|
||||||
#include "encoding.h"
|
#include "encoding.h"
|
||||||
#include "regexp.h"
|
#include "regexp.h"
|
||||||
#include "libdic_a-ery.h" /* generated by bison */
|
|
||||||
#include "libdic_a-erl.h" /* generated by flex */
|
|
||||||
#include "automaton.h"
|
#include "automaton.h"
|
||||||
|
#include "grammar.h"
|
||||||
|
|
||||||
|
|
||||||
static const unsigned int DEFAULT_VECT_ALLOC = 100;
|
static const unsigned int DEFAULT_VECT_ALLOC = 100;
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Function prototype for bison generated parser
|
|
||||||
*/
|
|
||||||
int regexpparse(yyscan_t scanner, NODE** root,
|
|
||||||
struct search_RegE_list_t *iList,
|
|
||||||
struct regexp_error_report_t *err);
|
|
||||||
|
|
||||||
|
|
||||||
template <typename DAWG_EDGE>
|
template <typename DAWG_EDGE>
|
||||||
const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE *eptr) const
|
const DAWG_EDGE* Dictionary::seekEdgePtr(const wchar_t* s, const DAWG_EDGE *eptr) const
|
||||||
{
|
{
|
||||||
|
@ -469,7 +454,7 @@ struct params_regexp_t
|
||||||
int maxlength;
|
int maxlength;
|
||||||
Automaton *automaton_field;
|
Automaton *automaton_field;
|
||||||
struct search_RegE_list_t *charlist;
|
struct search_RegE_list_t *charlist;
|
||||||
char word[DIC_WORD_MAX];
|
wchar_t word[DIC_WORD_MAX];
|
||||||
int wordlen;
|
int wordlen;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -478,7 +463,7 @@ template <typename DAWG_EDGE>
|
||||||
void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
|
void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
|
||||||
int state,
|
int state,
|
||||||
const DAWG_EDGE *edgeptr,
|
const DAWG_EDGE *edgeptr,
|
||||||
vector<string> &oWordList,
|
vector<wstring> &oWordList,
|
||||||
unsigned int iMaxResults) const
|
unsigned int iMaxResults) const
|
||||||
{
|
{
|
||||||
if (iMaxResults && oWordList.size() >= iMaxResults)
|
if (iMaxResults && oWordList.size() >= iMaxResults)
|
||||||
|
@ -488,7 +473,7 @@ void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
|
||||||
/* if we have a valid word we store it */
|
/* if we have a valid word we store it */
|
||||||
if (params->automaton_field->accept(state) && edgeptr->term)
|
if (params->automaton_field->accept(state) && edgeptr->term)
|
||||||
{
|
{
|
||||||
int l = strlen(params->word);
|
int l = wcslen(params->word);
|
||||||
if (params->minlength <= l &&
|
if (params->minlength <= l &&
|
||||||
params->maxlength >= l)
|
params->maxlength >= l)
|
||||||
{
|
{
|
||||||
|
@ -504,98 +489,16 @@ void Dictionary::searchRegexpRecTempl(struct params_regexp_t *params,
|
||||||
/* 1: the letter appears in the automaton as is */
|
/* 1: the letter appears in the automaton as is */
|
||||||
if (next_state)
|
if (next_state)
|
||||||
{
|
{
|
||||||
params->word[params->wordlen] = current->chr + 'a' - 1;
|
params->word[params->wordlen] = current->chr + L'a' - 1;
|
||||||
params->wordlen ++;
|
params->wordlen ++;
|
||||||
searchRegexpRecTempl(params, next_state, current, oWordList, iMaxResults);
|
searchRegexpRecTempl(params, next_state, current, oWordList, iMaxResults);
|
||||||
params->wordlen --;
|
params->wordlen --;
|
||||||
params->word[params->wordlen] = '\0';
|
params->word[params->wordlen] = L'\0';
|
||||||
}
|
}
|
||||||
} while (!(*current++).last);
|
} while (!(*current++).last);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void Dictionary::searchRegExpInner(const string &iRegexp,
|
|
||||||
vector<string> &oWordList,
|
|
||||||
struct search_RegE_list_t *iList,
|
|
||||||
unsigned int iMaxResults) const
|
|
||||||
{
|
|
||||||
// Allocate room for all the results
|
|
||||||
if (iMaxResults)
|
|
||||||
oWordList.reserve(iMaxResults);
|
|
||||||
else
|
|
||||||
oWordList.reserve(DEFAULT_VECT_ALLOC);
|
|
||||||
|
|
||||||
int ptl[REGEXP_MAX+1];
|
|
||||||
int PS [REGEXP_MAX+1];
|
|
||||||
|
|
||||||
/* (expr)# */
|
|
||||||
char stringbuf[250];
|
|
||||||
sprintf(stringbuf, "(%s)#", iRegexp.c_str());
|
|
||||||
for (int i = 0; i < REGEXP_MAX; i++)
|
|
||||||
{
|
|
||||||
PS[i] = 0;
|
|
||||||
ptl[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct regexp_error_report_t report;
|
|
||||||
report.pos1 = 0;
|
|
||||||
report.pos2 = 0;
|
|
||||||
report.msg[0] = '\0';
|
|
||||||
|
|
||||||
/* parsing */
|
|
||||||
yyscan_t scanner;
|
|
||||||
regexplex_init( &scanner );
|
|
||||||
YY_BUFFER_STATE buf = regexp_scan_string(stringbuf, scanner);
|
|
||||||
NODE *root = NULL;
|
|
||||||
int value = regexpparse(scanner , &root, iList, &report);
|
|
||||||
regexp_delete_buffer(buf, scanner);
|
|
||||||
regexplex_destroy(scanner);
|
|
||||||
|
|
||||||
if (value)
|
|
||||||
{
|
|
||||||
#ifdef DEBUG_FLEX_IS_BROKEN
|
|
||||||
fprintf(stderr, "parser error at pos %d - %d: %s\n",
|
|
||||||
report.pos1, report.pos2, report.msg);
|
|
||||||
#endif
|
|
||||||
regexp_delete_tree(root);
|
|
||||||
return ;
|
|
||||||
}
|
|
||||||
|
|
||||||
int n = 1;
|
|
||||||
int p = 1;
|
|
||||||
regexp_parcours(root, &p, &n, ptl);
|
|
||||||
PS [0] = p - 1;
|
|
||||||
ptl[0] = p - 1;
|
|
||||||
|
|
||||||
regexp_possuivante(root, PS);
|
|
||||||
|
|
||||||
Automaton *a = new Automaton(root->PP, ptl, PS, iList);
|
|
||||||
if (a)
|
|
||||||
{
|
|
||||||
struct params_regexp_t params;
|
|
||||||
params.minlength = iList->minlength;
|
|
||||||
params.maxlength = iList->maxlength;
|
|
||||||
params.automaton_field = a;
|
|
||||||
params.charlist = iList;
|
|
||||||
memset(params.word, '\0', sizeof(params.word));
|
|
||||||
params.wordlen = 0;
|
|
||||||
if (getHeader().getVersion() == 0)
|
|
||||||
{
|
|
||||||
searchRegexpRecTempl(¶ms, a->getInitId(),
|
|
||||||
getEdgeAt<DicEdgeOld>(getRoot()), oWordList, iMaxResults);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
searchRegexpRecTempl(¶ms, a->getInitId(),
|
|
||||||
getEdgeAt<DicEdge>(getRoot()), oWordList, iMaxResults);
|
|
||||||
}
|
|
||||||
|
|
||||||
delete a;
|
|
||||||
}
|
|
||||||
regexp_delete_tree(root);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void Dictionary::searchRegExp(const wstring &iRegexp,
|
void Dictionary::searchRegExp(const wstring &iRegexp,
|
||||||
vector<wstring> &oWordList,
|
vector<wstring> &oWordList,
|
||||||
struct search_RegE_list_t *iList,
|
struct search_RegE_list_t *iList,
|
||||||
|
@ -610,14 +513,65 @@ void Dictionary::searchRegExp(const wstring &iRegexp,
|
||||||
else
|
else
|
||||||
oWordList.reserve(DEFAULT_VECT_ALLOC);
|
oWordList.reserve(DEFAULT_VECT_ALLOC);
|
||||||
|
|
||||||
vector<string> tmpWordList;
|
int ptl[REGEXP_MAX+1];
|
||||||
// Do the actual work
|
int PS [REGEXP_MAX+1];
|
||||||
searchRegExpInner(convertToMb(iRegexp), tmpWordList, iList, iMaxResults);
|
|
||||||
|
|
||||||
vector<string>::const_iterator it;
|
for (int i = 0; i < REGEXP_MAX; i++)
|
||||||
for (it = tmpWordList.begin(); it != tmpWordList.end(); it++)
|
|
||||||
{
|
{
|
||||||
oWordList.push_back(convertToWc(*it));
|
PS[i] = 0;
|
||||||
|
ptl[i] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct regexp_error_report_t report;
|
||||||
|
report.pos1 = 0;
|
||||||
|
report.pos2 = 0;
|
||||||
|
report.msg[0] = '\0';
|
||||||
|
|
||||||
|
/* parsing */
|
||||||
|
Node *root = NULL;
|
||||||
|
bool parsingOk = parseRegexp(*this, (iRegexp + L"#").c_str(), &root, iList);
|
||||||
|
|
||||||
|
if (!parsingOk)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
fprintf(stderr, "parser error at pos %d - %d: %s\n",
|
||||||
|
report.pos1, report.pos2, report.msg);
|
||||||
|
#endif
|
||||||
|
delete root;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n = 1;
|
||||||
|
int p = 1;
|
||||||
|
root->traverse(p, n, ptl);
|
||||||
|
PS [0] = p - 1;
|
||||||
|
ptl[0] = p - 1;
|
||||||
|
|
||||||
|
root->nextPos(PS);
|
||||||
|
|
||||||
|
Automaton *a = new Automaton(root->getFirstPos(), ptl, PS, iList);
|
||||||
|
if (a)
|
||||||
|
{
|
||||||
|
struct params_regexp_t params;
|
||||||
|
params.minlength = iList->minlength;
|
||||||
|
params.maxlength = iList->maxlength;
|
||||||
|
params.automaton_field = a;
|
||||||
|
params.charlist = iList;
|
||||||
|
memset(params.word, L'\0', sizeof(params.word));
|
||||||
|
params.wordlen = 0;
|
||||||
|
if (getHeader().getVersion() == 0)
|
||||||
|
{
|
||||||
|
searchRegexpRecTempl(¶ms, a->getInitId(),
|
||||||
|
getEdgeAt<DicEdgeOld>(getRoot()), oWordList, iMaxResults);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
searchRegexpRecTempl(¶ms, a->getInitId(),
|
||||||
|
getEdgeAt<DicEdge>(getRoot()), oWordList, iMaxResults);
|
||||||
|
}
|
||||||
|
|
||||||
|
delete a;
|
||||||
|
}
|
||||||
|
delete root;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
59
dic/erl.lpp
59
dic/erl.lpp
|
@ -1,59 +0,0 @@
|
||||||
%{
|
|
||||||
/*****************************************************************************
|
|
||||||
* Eliot
|
|
||||||
* Copyright (C) 2005-2007 Antoine Fraboulet
|
|
||||||
* Authors: Antoine Fraboulet
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*****************************************************************************/
|
|
||||||
|
|
||||||
#include "dic.h"
|
|
||||||
#include "regexp.h"
|
|
||||||
#include "libdic_a-ery.h"
|
|
||||||
|
|
||||||
#define MASK_TO_REMOVE 0x1F
|
|
||||||
|
|
||||||
%}
|
|
||||||
%option prefix="regexp"
|
|
||||||
%option outfile="lex.yy.c"
|
|
||||||
%option header-file="libdic_a-erl.h"
|
|
||||||
%option reentrant bison-bridge
|
|
||||||
%option bison-locations
|
|
||||||
%option noyywrap nounput
|
|
||||||
|
|
||||||
/* TODO : remove lexer translation */
|
|
||||||
alphabet [a-zA-Z]
|
|
||||||
%%
|
|
||||||
|
|
||||||
{alphabet} {yylval_param->c=(yytext[0]&MASK_TO_REMOVE); return LEX_CHAR;}
|
|
||||||
"[" {return LEX_L_SQBRACKET;}
|
|
||||||
"]" {return LEX_R_SQBRACKET;}
|
|
||||||
"(" {return LEX_L_BRACKET;}
|
|
||||||
")" {return LEX_R_BRACKET;}
|
|
||||||
"^" {return LEX_HAT;}
|
|
||||||
|
|
||||||
"." {return LEX_ALL;}
|
|
||||||
":v:" {return LEX_VOWL;}
|
|
||||||
":c:" {return LEX_CONS;}
|
|
||||||
":1:" {return LEX_USER1;}
|
|
||||||
":2:" {return LEX_USER2;}
|
|
||||||
|
|
||||||
"?" {return LEX_QMARK;}
|
|
||||||
"+" {return LEX_PLUS;}
|
|
||||||
"*" {return LEX_STAR;}
|
|
||||||
|
|
||||||
"#" {return LEX_SHARP;}
|
|
||||||
%%
|
|
||||||
|
|
295
dic/ery.ypp
295
dic/ery.ypp
|
@ -1,295 +0,0 @@
|
||||||
%{
|
|
||||||
/*****************************************************************************
|
|
||||||
* Eliot
|
|
||||||
* Copyright (C) 2005-2007 Antoine Fraboulet
|
|
||||||
* Authors: Antoine Fraboulet
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*
|
|
||||||
* This program is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU General Public License for more details.
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU General Public License
|
|
||||||
* along with this program; if not, write to the Free Software
|
|
||||||
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
|
||||||
*****************************************************************************/
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
|
||||||
#include <malloc.h>
|
|
||||||
|
|
||||||
#include "dic.h"
|
|
||||||
#include "regexp.h"
|
|
||||||
#include "libdic_a-ery.h"
|
|
||||||
#include "libdic_a-erl.h"
|
|
||||||
|
|
||||||
/* ************************************************** */
|
|
||||||
/* ************************************************** */
|
|
||||||
/* ************************************************** */
|
|
||||||
|
|
||||||
/**
|
|
||||||
* function prototype for parser generated by bison
|
|
||||||
*/
|
|
||||||
int regexpparse(yyscan_t scanner, NODE** root,
|
|
||||||
struct search_RegE_list_t *list,
|
|
||||||
struct regexp_error_report_t *err);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* function prototype for error reporting
|
|
||||||
*/
|
|
||||||
void regexperror(YYLTYPE *llocp, yyscan_t scanner, NODE** root,
|
|
||||||
struct search_RegE_list_t *list,
|
|
||||||
struct regexp_error_report_t *err,
|
|
||||||
char const *msg);
|
|
||||||
|
|
||||||
/* ************************************************** */
|
|
||||||
/* ************************************************** */
|
|
||||||
/* ************************************************** */
|
|
||||||
|
|
||||||
%}
|
|
||||||
%union {
|
|
||||||
char c;
|
|
||||||
NODE *NODE_TYPE;
|
|
||||||
char letters[DIC_LETTERS];
|
|
||||||
};
|
|
||||||
|
|
||||||
%defines
|
|
||||||
%name-prefix="regexp"
|
|
||||||
%pure-parser
|
|
||||||
%locations
|
|
||||||
%parse-param {yyscan_t yyscanner}
|
|
||||||
%parse-param {NODE **root}
|
|
||||||
%parse-param {struct search_RegE_list_t *list}
|
|
||||||
%parse-param {struct regexp_error_report_t *err}
|
|
||||||
%lex-param {yyscan_t yyscanner}
|
|
||||||
|
|
||||||
%token <c> LEX_CHAR
|
|
||||||
%token LEX_ALL
|
|
||||||
%token LEX_VOWL
|
|
||||||
%token LEX_CONS
|
|
||||||
%token LEX_USER1
|
|
||||||
%token LEX_USER2
|
|
||||||
|
|
||||||
%token LEX_L_SQBRACKET LEX_R_SQBRACKET
|
|
||||||
%token LEX_L_BRACKET LEX_R_BRACKET
|
|
||||||
%token LEX_HAT
|
|
||||||
|
|
||||||
%token LEX_QMARK
|
|
||||||
%token LEX_PLUS
|
|
||||||
%token LEX_STAR
|
|
||||||
%token LEX_SHARP
|
|
||||||
|
|
||||||
%type <NODE_TYPE> var
|
|
||||||
%type <NODE_TYPE> expr
|
|
||||||
%type <letters> vardis
|
|
||||||
%type <letters> exprdis
|
|
||||||
%type <NODE_TYPE> exprdisnode
|
|
||||||
%start start
|
|
||||||
%%
|
|
||||||
|
|
||||||
start: LEX_L_BRACKET expr LEX_R_BRACKET LEX_SHARP
|
|
||||||
{
|
|
||||||
NODE* sharp = regexp_createNODE(NODE_VAR,RE_FINAL_TOK,NULL,NULL);
|
|
||||||
*root = regexp_createNODE(NODE_AND,'\0',$2,sharp);
|
|
||||||
YYACCEPT;
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
expr : var
|
|
||||||
{
|
|
||||||
$$=$1;
|
|
||||||
}
|
|
||||||
| expr expr
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_AND,'\0',$1,$2);
|
|
||||||
}
|
|
||||||
| var LEX_QMARK
|
|
||||||
{
|
|
||||||
NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
|
|
||||||
$$=regexp_createNODE(NODE_OR,'\0',$1,epsilon);
|
|
||||||
}
|
|
||||||
| var LEX_PLUS
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_PLUS,'\0',$1,NULL);
|
|
||||||
}
|
|
||||||
| var LEX_STAR
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_STAR,'\0',$1,NULL);
|
|
||||||
}
|
|
||||||
/* () */
|
|
||||||
| LEX_L_BRACKET expr LEX_R_BRACKET
|
|
||||||
{
|
|
||||||
$$=$2;
|
|
||||||
}
|
|
||||||
| LEX_L_BRACKET expr LEX_R_BRACKET LEX_QMARK
|
|
||||||
{
|
|
||||||
NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
|
|
||||||
$$=regexp_createNODE(NODE_OR,'\0',$2,epsilon);
|
|
||||||
}
|
|
||||||
| LEX_L_BRACKET expr LEX_R_BRACKET LEX_PLUS
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_PLUS,'\0',$2,NULL);
|
|
||||||
}
|
|
||||||
| LEX_L_BRACKET expr LEX_R_BRACKET LEX_STAR
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_STAR,'\0',$2,NULL);
|
|
||||||
}
|
|
||||||
/* [] */
|
|
||||||
| LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET
|
|
||||||
{
|
|
||||||
$$=$2;
|
|
||||||
}
|
|
||||||
| LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_QMARK
|
|
||||||
{
|
|
||||||
NODE* epsilon=regexp_createNODE(NODE_VAR,RE_EPSILON,NULL,NULL);
|
|
||||||
$$=regexp_createNODE(NODE_OR,'\0',$2,epsilon);
|
|
||||||
}
|
|
||||||
| LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_PLUS
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_PLUS,'\0',$2,NULL);
|
|
||||||
}
|
|
||||||
| LEX_L_SQBRACKET exprdisnode LEX_R_SQBRACKET LEX_STAR
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_STAR,'\0',$2,NULL);
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
var : LEX_CHAR
|
|
||||||
{
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("var : lecture %c\n",$1 + 'a' -1);
|
|
||||||
#endif
|
|
||||||
$$=regexp_createNODE(NODE_VAR,$1,NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_ALL
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_VAR,RE_ALL_MATCH,NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_VOWL
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_VAR,RE_VOWL_MATCH,NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_CONS
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_VAR,RE_CONS_MATCH,NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_USER1
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_VAR,RE_USR1_MATCH,NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_USER2
|
|
||||||
{
|
|
||||||
$$=regexp_createNODE(NODE_VAR,RE_USR2_MATCH,NULL,NULL);
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
exprdisnode : exprdis
|
|
||||||
{
|
|
||||||
int i,j;
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("exprdisnode : exprdis : ");
|
|
||||||
#endif
|
|
||||||
for(i=RE_LIST_USER_END + 1; i < DIC_SEARCH_REGE_LIST; i++)
|
|
||||||
{
|
|
||||||
if (list->valid[i] == 0)
|
|
||||||
{
|
|
||||||
list->valid[i] = 1;
|
|
||||||
list->symbl[i] = RE_ALL_MATCH + i;
|
|
||||||
list->letters[i][0] = 0;
|
|
||||||
for(j=1; j < DIC_LETTERS; j++)
|
|
||||||
list->letters[i][j] = $1[j] ? 1 : 0;
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("list %d symbl x%02x : ",i,list->symbl[i]);
|
|
||||||
for(j=0; j < DIC_LETTERS; j++)
|
|
||||||
if (list->letters[i][j])
|
|
||||||
printf("%c",j+'a'-1);
|
|
||||||
printf("\n");
|
|
||||||
#endif
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$$=regexp_createNODE(NODE_VAR,list->symbl[i],NULL,NULL);
|
|
||||||
}
|
|
||||||
| LEX_HAT exprdis
|
|
||||||
{
|
|
||||||
int i,j;
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("exprdisnode : HAT exprdis : ");
|
|
||||||
#endif
|
|
||||||
for(i=RE_LIST_USER_END + 1; i < DIC_SEARCH_REGE_LIST; i++)
|
|
||||||
{
|
|
||||||
if (list->valid[i] == 0)
|
|
||||||
{
|
|
||||||
list->valid[i] = 1;
|
|
||||||
list->symbl[i] = RE_ALL_MATCH + i;
|
|
||||||
list->letters[i][0] = 0;
|
|
||||||
for(j=1; j < DIC_LETTERS; j++)
|
|
||||||
list->letters[i][j] = $2[j] ? 0 : 1;
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("list %d symbl x%02x : ",i,list->symbl[i]);
|
|
||||||
for(j=0; j < DIC_LETTERS; j++)
|
|
||||||
if (list->letters[i][j])
|
|
||||||
printf("%c",j+'a'-1);
|
|
||||||
printf("\n");
|
|
||||||
#endif
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
$$=regexp_createNODE(NODE_VAR,list->symbl[i],NULL,NULL);
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
exprdis: vardis
|
|
||||||
{
|
|
||||||
memcpy($$,$1,sizeof(char)*DIC_LETTERS);
|
|
||||||
}
|
|
||||||
| vardis exprdis
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
for(i=0; i < DIC_LETTERS; i++)
|
|
||||||
$$[i] = $1[i] | $2[i];
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
vardis: LEX_CHAR
|
|
||||||
{
|
|
||||||
int c = $1;
|
|
||||||
memset($$,0,sizeof(char)*DIC_LETTERS);
|
|
||||||
#ifdef DEBUG_RE_PARSE
|
|
||||||
printf("vardis : lecture %c\n",c + 'a' -1);
|
|
||||||
#endif
|
|
||||||
$$[c] = 1;
|
|
||||||
}
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
%%
|
|
||||||
|
|
||||||
#define UNUSED __attribute__((unused))
|
|
||||||
|
|
||||||
void regexperror(YYLTYPE *llocp, yyscan_t UNUSED yyscanner, NODE UNUSED **root,
|
|
||||||
struct search_RegE_list_t UNUSED *list,
|
|
||||||
struct regexp_error_report_t *err, char const *msg)
|
|
||||||
{
|
|
||||||
err->pos1 = llocp->first_column;
|
|
||||||
err->pos2 = llocp->last_column;
|
|
||||||
strncpy(err->msg,msg,sizeof(err->msg));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* shut down the compiler
|
|
||||||
*/
|
|
||||||
//int yy_init_globals (yyscan_t yyscanner);
|
|
340
dic/grammar.cpp
Normal file
340
dic/grammar.cpp
Normal file
|
@ -0,0 +1,340 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
* Eliot
|
||||||
|
* Copyright (C) 2008 Olivier Teulière
|
||||||
|
* Authors: Olivier Teulière <ipkiss @@ gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <stack>
|
||||||
|
#include <boost/spirit/core.hpp>
|
||||||
|
#include <boost/spirit/utility/chset.hpp>
|
||||||
|
#include <boost/spirit/tree/ast.hpp>
|
||||||
|
#ifdef DEBUG_RE
|
||||||
|
#include <boost/spirit/tree/tree_to_xml.hpp>
|
||||||
|
#include <map>
|
||||||
|
#include <iostream>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "dic.h"
|
||||||
|
#include "header.h"
|
||||||
|
#include "regexp.h"
|
||||||
|
|
||||||
|
using namespace boost::spirit;
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// TODO:
|
||||||
|
// - error handling
|
||||||
|
|
||||||
|
// A few typedefs to simplify things
|
||||||
|
typedef const wchar_t *iterator_t;
|
||||||
|
typedef tree_match<iterator_t> parse_tree_match_t;
|
||||||
|
typedef parse_tree_match_t::const_tree_iterator iter_t;
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Boost.Spirit grammar describing the regular expressions understood by the
 * dictionary search.
 *
 * Each rule is tagged with one of the *Id constants below, so that the code
 * walking the generated AST can tell which rule produced a given node.
 */
struct RegexpGrammar : grammar<RegexpGrammar>
{
    // Parser tags attached to the rules (and thus to the AST nodes)
    static const int wrapperId = 0;
    static const int exprId = 1;
    static const int repeatId = 2;
    static const int groupId = 3;
    static const int varId = 4;
    static const int choiceId = 5;
    static const int alphavarId = 6;

    /**
     * Build the grammar for a given alphabet.
     * @param letters: letters accepted as plain characters. Their
     *      towlower() counterparts are accepted too.
     */
    RegexpGrammar(const wstring &letters)
        : m_allLetters(letters)
    {
        // Append the lower case version of every letter
        wstring lowerCase(letters);
        std::transform(lowerCase.begin(), lowerCase.end(),
                       lowerCase.begin(), towlower);
        m_allLetters += lowerCase;
    }

    template <typename ScannerT>
    struct definition
    {
        /// Define the rules of the grammar
        definition(const RegexpGrammar &self)
        {
            // A complete regular expression is terminated by '#'
            wrapper
                = expr >> L"#"
                ;

            // Sequence of repeated groups
            expr
                = repeat >> *expr
                ;

            // Group with an optional repetition operator; when present,
            // the operator becomes the root of the sub-tree
            repeat
                = group >> root_node_d[ch_p(L'?')]
                | group >> root_node_d[ch_p(L'*')]
                | group >> root_node_d[ch_p(L'+')]
                | group
                ;

            // Single token, (negated) character class or parenthesized
            // sequence. The closing delimiter is not kept in the tree.
            group
                = var
                | root_node_d[str_p(L"[^")] >> choice >> no_node_d[ch_p(L']')]
                | root_node_d[ch_p(L'[')] >> choice >> no_node_d[ch_p(L']')]
                | root_node_d[ch_p(L'(')] >> +repeat >> no_node_d[ch_p(L')')] // XXX: 'expr' instead of '+repeat' doesn't work. Why?
                ;

            // Letter, joker or one of the special letter lists
            var
                = alphavar
                | ch_p(L'.')
                | str_p(L":v:")
                | str_p(L":c:")
                | str_p(L":1:")
                | str_p(L":2:")
                ;

            // Content of a character class, gathered in a single leaf
            choice
                = leaf_node_d[+alphavar]
                ;

            // Any letter of the alphabet given to the grammar
            alphavar
                = chset<>(self.m_allLetters.c_str())
                ;
        }

        rule<ScannerT, parser_context<>, parser_tag<wrapperId> > wrapper;
        rule<ScannerT, parser_context<>, parser_tag<exprId> > expr;
        rule<ScannerT, parser_context<>, parser_tag<repeatId> > repeat;
        rule<ScannerT, parser_context<>, parser_tag<groupId> > group;
        rule<ScannerT, parser_context<>, parser_tag<varId> > var;
        rule<ScannerT, parser_context<>, parser_tag<choiceId> > choice;
        rule<ScannerT, parser_context<>, parser_tag<alphavarId> > alphavar;

        /// Entry point of the grammar
        const rule<ScannerT, parser_context<>, parser_tag<wrapperId> > & start() const { return wrapper; }
    };

    /// Accepted letters: the ones given to the constructor, then their lower case version
    wstring m_allLetters;
};
|
||||||
|
|
||||||
|
|
||||||
|
void evaluate(const Header &iHeader, iter_t const& i, stack<Node*> &evalStack,
|
||||||
|
struct search_RegE_list_t *iList, bool negate = false)
|
||||||
|
{
|
||||||
|
if (i->value.id() == RegexpGrammar::alphavarId)
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 0);
|
||||||
|
|
||||||
|
// Extract the character and convert it to its internal code
|
||||||
|
uint8_t code = iHeader.getCodeFromChar(*i->value.begin());
|
||||||
|
Node *n = new Node(NODE_VAR, code, NULL, NULL);
|
||||||
|
evalStack.push(n);
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::choiceId)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
assert(i->children.size() == 0);
|
||||||
|
|
||||||
|
string choiceLetters(i->value.begin(), i->value.end());
|
||||||
|
int j;
|
||||||
|
for (j = RE_LIST_USER_END + 1; j < DIC_SEARCH_REGE_LIST; j++)
|
||||||
|
{
|
||||||
|
if (!iList->valid[j])
|
||||||
|
{
|
||||||
|
iList->valid[j] = true;
|
||||||
|
iList->symbl[j] = RE_ALL_MATCH + j;
|
||||||
|
iList->letters[j][0] = false;
|
||||||
|
for (int k = 1; k < DIC_LETTERS; k++)
|
||||||
|
{
|
||||||
|
bool contains = (choiceLetters.find(k + L'a' - 1) != string::npos);
|
||||||
|
iList->letters[j][k] = (contains ? !negate : negate);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Node *node = new Node(NODE_VAR, iList->symbl[j], NULL, NULL);
|
||||||
|
evalStack.push(node);
|
||||||
|
#endif
|
||||||
|
#if 1
|
||||||
|
assert(i->children.size() == 0);
|
||||||
|
|
||||||
|
wstring choiceLetters(i->value.begin(), i->value.end());
|
||||||
|
// Make sure the letters are in upper case
|
||||||
|
std::transform(choiceLetters.begin(), choiceLetters.end(),
|
||||||
|
choiceLetters.begin(), towupper);
|
||||||
|
// The dictionary letters are already in upper case
|
||||||
|
const wstring &letters = iHeader.getLetters();
|
||||||
|
wstring::const_iterator itLetter;
|
||||||
|
int j;
|
||||||
|
for (j = RE_LIST_USER_END + 1; j < DIC_SEARCH_REGE_LIST; ++j)
|
||||||
|
{
|
||||||
|
if (!iList->valid[j])
|
||||||
|
{
|
||||||
|
iList->valid[j] = true;
|
||||||
|
iList->symbl[j] = RE_ALL_MATCH + j;
|
||||||
|
iList->letters[j][0] = false;
|
||||||
|
for (itLetter = letters.begin(); itLetter != letters.end(); ++itLetter)
|
||||||
|
{
|
||||||
|
bool contains = (choiceLetters.find(*itLetter) != string::npos);
|
||||||
|
iList->letters[j][iHeader.getCodeFromChar(*itLetter)] =
|
||||||
|
(contains ? !negate : negate);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Node *node = new Node(NODE_VAR, iList->symbl[j], NULL, NULL);
|
||||||
|
evalStack.push(node);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::varId)
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 0);
|
||||||
|
|
||||||
|
string var(i->value.begin(), i->value.end());
|
||||||
|
Node *node = NULL;
|
||||||
|
if (var == ":v:")
|
||||||
|
node = new Node(NODE_VAR, RE_VOWL_MATCH, NULL, NULL);
|
||||||
|
else if (var == ":c:")
|
||||||
|
node = new Node(NODE_VAR, RE_CONS_MATCH, NULL, NULL);
|
||||||
|
else if (var == ":1:")
|
||||||
|
node = new Node(NODE_VAR, RE_USR1_MATCH, NULL, NULL);
|
||||||
|
else if (var == ":2:")
|
||||||
|
node = new Node(NODE_VAR, RE_USR2_MATCH, NULL, NULL);
|
||||||
|
else if (var == ".")
|
||||||
|
node = new Node(NODE_VAR, RE_ALL_MATCH, NULL, NULL);
|
||||||
|
else
|
||||||
|
assert(0);
|
||||||
|
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::groupId)
|
||||||
|
{
|
||||||
|
if (*i->value.begin() == L'(')
|
||||||
|
{
|
||||||
|
assert(i->children.size() != 0);
|
||||||
|
// Create a node for each child
|
||||||
|
iter_t iter;
|
||||||
|
for (iter = i->children.begin(); iter != i->children.end(); ++iter)
|
||||||
|
evaluate(iHeader, iter, evalStack, iList);
|
||||||
|
// "Concatenate" the created child nodes with AND nodes
|
||||||
|
for (uint j = 0; j < i->children.size() - 1; ++j)
|
||||||
|
{
|
||||||
|
Node *old2 = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *old1 = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *node = new Node(NODE_AND, '\0', old1, old2);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (*i->value.begin() == L'[')
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 1);
|
||||||
|
bool hasCaret = (i->value.begin() + 1 != i->value.end());
|
||||||
|
evaluate(iHeader, i->children.begin(), evalStack, iList, hasCaret);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::repeatId)
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 1);
|
||||||
|
evaluate(iHeader, i->children.begin(), evalStack, iList);
|
||||||
|
|
||||||
|
if (*i->value.begin() == L'*')
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 1);
|
||||||
|
Node *old = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *node = new Node(NODE_STAR, '\0', old, NULL);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else if (*i->value.begin() == L'+')
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 1);
|
||||||
|
Node *old = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *node = new Node(NODE_PLUS, '\0', old, NULL);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else if (*i->value.begin() == L'?')
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 1);
|
||||||
|
Node *old = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *epsilon = new Node(NODE_VAR, RE_EPSILON, NULL, NULL);
|
||||||
|
Node *node = new Node(NODE_OR, '\0', old, epsilon);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::exprId)
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 2);
|
||||||
|
evaluate(iHeader, i->children.begin(), evalStack, iList);
|
||||||
|
evaluate(iHeader, i->children.begin() + 1, evalStack, iList);
|
||||||
|
|
||||||
|
Node *old2 = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *old1 = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node *node = new Node(NODE_AND, '\0', old1, old2);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else if (i->value.id() == RegexpGrammar::wrapperId)
|
||||||
|
{
|
||||||
|
assert(i->children.size() == 2);
|
||||||
|
evaluate(iHeader, i->children.begin(), evalStack, iList);
|
||||||
|
Node *old = evalStack.top();
|
||||||
|
evalStack.pop();
|
||||||
|
Node* sharp = new Node(NODE_VAR, RE_FINAL_TOK, NULL, NULL);
|
||||||
|
Node *node = new Node(NODE_AND, '\0', old, sharp);
|
||||||
|
evalStack.push(node);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList)
|
||||||
|
{
|
||||||
|
// Create a grammar object
|
||||||
|
RegexpGrammar g(iDic.getHeader().getLetters());
|
||||||
|
// Parse the input and generate an Abstract Syntax Tree (AST)
|
||||||
|
tree_parse_info<const wchar_t*> info = ast_parse(input, g);
|
||||||
|
|
||||||
|
if (info.full)
|
||||||
|
{
|
||||||
|
#ifdef DEBUG_RE
|
||||||
|
// Dump parse tree as XML
|
||||||
|
std::map<parser_id, std::string> rule_names;
|
||||||
|
rule_names[RegexpGrammar::wrapperId] = "wrapper";
|
||||||
|
rule_names[RegexpGrammar::exprId] = "expr";
|
||||||
|
rule_names[RegexpGrammar::repeatId] = "repeat";
|
||||||
|
rule_names[RegexpGrammar::groupId] = "group";
|
||||||
|
rule_names[RegexpGrammar::varId] = "var";
|
||||||
|
rule_names[RegexpGrammar::choiceId] = "choice";
|
||||||
|
rule_names[RegexpGrammar::alphavarId] = "alphavar";
|
||||||
|
tree_to_xml(cout, info.trees);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
stack<Node*> evalStack;
|
||||||
|
evaluate(iDic.getHeader(), info.trees.begin(), evalStack, iList);
|
||||||
|
assert(evalStack.size() == 1);
|
||||||
|
*root = evalStack.top();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
31
dic/grammar.h
Normal file
31
dic/grammar.h
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
* Eliot
|
||||||
|
* Copyright (C) 2008 Olivier Teulière
|
||||||
|
* Authors: Olivier Teulière <ipkiss @@ gmail.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, write to the Free Software
|
||||||
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#ifndef _GRAMMAR_H_
|
||||||
|
#define _GRAMMAR_H_
|
||||||
|
|
||||||
|
class Dictionary;
|
||||||
|
class Node;
|
||||||
|
struct search_RegE_list_t;
|
||||||
|
|
||||||
|
bool parseRegexp(const Dictionary &iDic, const wchar_t *input, Node **root, struct search_RegE_list_t *iList);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
230
dic/regexp.cpp
230
dic/regexp.cpp
|
@ -39,115 +39,80 @@
|
||||||
#include "regexp.h"
|
#include "regexp.h"
|
||||||
#include "automaton.h"
|
#include "automaton.h"
|
||||||
|
|
||||||
#ifndef PDBG
|
|
||||||
#ifdef DEBUG_RE2
|
|
||||||
#define PDBG(x) x
|
|
||||||
#else
|
|
||||||
#define PDBG(x)
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
Node::Node(int type, char v, Node *fg, Node *fd)
|
||||||
NODE* regexp_createNODE(int type, char v, NODE *fg, NODE *fd)
|
: m_type(type), m_var(v), m_fg(fg), m_fd(fd), m_number(0), m_position(0),
|
||||||
|
m_annulable(false), m_PP(0), m_DP(0)
|
||||||
{
|
{
|
||||||
NODE *x;
|
|
||||||
x=(NODE *)malloc(sizeof(NODE));
|
|
||||||
x->type = type;
|
|
||||||
x->var = v;
|
|
||||||
x->fd = fd;
|
|
||||||
x->fg = fg;
|
|
||||||
x->number = 0;
|
|
||||||
x->position = 0;
|
|
||||||
x->annulable = 0;
|
|
||||||
x->PP = 0;
|
|
||||||
x->DP = 0;
|
|
||||||
return x;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void regexp_delete_tree(NODE *root)
|
Node::~Node()
|
||||||
{
|
{
|
||||||
if (root == NULL)
|
delete m_fg;
|
||||||
return;
|
delete m_fd;
|
||||||
regexp_delete_tree(root->fg);
|
|
||||||
regexp_delete_tree(root->fd);
|
|
||||||
free(root);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
|
||||||
static void print_node(FILE*, NODE *n, int detail);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* computes position, annulable, PP, DP attributes
|
* p is the current leaf position
|
||||||
* @param r = root
|
* n is the current node number
|
||||||
* @param p = current leaf position
|
|
||||||
* @param n = current node number
|
|
||||||
* @param ptl = position to letter
|
|
||||||
*/
|
*/
|
||||||
|
void Node::traverse(int &p, int &n, int ptl[])
|
||||||
void regexp_parcours(NODE* r, int *p, int *n, int ptl[])
|
|
||||||
{
|
{
|
||||||
if (r == NULL)
|
if (m_fg)
|
||||||
return;
|
m_fg->traverse(p, n, ptl);
|
||||||
|
if (m_fd)
|
||||||
|
m_fd->traverse(p, n, ptl);
|
||||||
|
|
||||||
regexp_parcours(r->fg, p, n, ptl);
|
m_number = n;
|
||||||
regexp_parcours(r->fd, p, n, ptl);
|
++n;
|
||||||
|
|
||||||
switch (r->type)
|
switch (m_type)
|
||||||
{
|
{
|
||||||
case NODE_VAR:
|
case NODE_VAR:
|
||||||
r->position = *p;
|
m_position = p;
|
||||||
ptl[*p] = r->var;
|
ptl[p] = m_var;
|
||||||
*p = *p + 1;
|
++p;
|
||||||
r->annulable = 0;
|
m_annulable = false;
|
||||||
r->PP = 1 << (r->position - 1);
|
m_PP = 1 << (m_position - 1);
|
||||||
r->DP = 1 << (r->position - 1);
|
m_DP = 1 << (m_position - 1);
|
||||||
break;
|
break;
|
||||||
case NODE_OR:
|
case NODE_OR:
|
||||||
r->position = 0;
|
m_position = 0;
|
||||||
r->annulable = r->fg->annulable || r->fd->annulable;
|
m_annulable = m_fg->m_annulable || m_fd->m_annulable;
|
||||||
r->PP = r->fg->PP | r->fd->PP;
|
m_PP = m_fg->m_PP | m_fd->m_PP;
|
||||||
r->DP = r->fg->DP | r->fd->DP;
|
m_DP = m_fg->m_DP | m_fd->m_DP;
|
||||||
break;
|
break;
|
||||||
case NODE_AND:
|
case NODE_AND:
|
||||||
r->position = 0;
|
m_position = 0;
|
||||||
r->annulable = r->fg->annulable && r->fd->annulable;
|
m_annulable = m_fg->m_annulable && m_fd->m_annulable;
|
||||||
r->PP = (r->fg->annulable) ? (r->fg->PP | r->fd->PP) : r->fg->PP;
|
m_PP = (m_fg->m_annulable) ? (m_fg->m_PP | m_fd->m_PP) : m_fg->m_PP;
|
||||||
r->DP = (r->fd->annulable) ? (r->fg->DP | r->fd->DP) : r->fd->DP;
|
m_DP = (m_fd->m_annulable) ? (m_fg->m_DP | m_fd->m_DP) : m_fd->m_DP;
|
||||||
break;
|
break;
|
||||||
case NODE_PLUS:
|
case NODE_PLUS:
|
||||||
r->position = 0;
|
m_position = 0;
|
||||||
r->annulable = 0;
|
m_annulable = false;
|
||||||
r->PP = r->fg->PP;
|
m_PP = m_fg->m_PP;
|
||||||
r->DP = r->fg->DP;
|
m_DP = m_fg->m_DP;
|
||||||
break;
|
break;
|
||||||
case NODE_STAR:
|
case NODE_STAR:
|
||||||
r->position = 0;
|
m_position = 0;
|
||||||
r->annulable = 1;
|
m_annulable = true;
|
||||||
r->PP = r->fg->PP;
|
m_PP = m_fg->m_PP;
|
||||||
r->DP = r->fg->DP;
|
m_DP = m_fg->m_DP;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
r->number = *n;
|
|
||||||
*n = *n + 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* computes possuivante
|
|
||||||
* @param r = root
|
|
||||||
* @param PS = next position
|
|
||||||
*/
|
|
||||||
|
|
||||||
void regexp_possuivante(NODE* r, int PS[])
|
void Node::nextPos(int PS[])
|
||||||
{
|
{
|
||||||
if (r == NULL)
|
if (m_fg)
|
||||||
return;
|
m_fg->nextPos(PS);
|
||||||
|
if (m_fd)
|
||||||
|
m_fd->nextPos(PS);
|
||||||
|
|
||||||
regexp_possuivante(r->fg, PS);
|
switch (m_type)
|
||||||
regexp_possuivante(r->fd, PS);
|
|
||||||
|
|
||||||
switch (r->type)
|
|
||||||
{
|
{
|
||||||
case NODE_AND:
|
case NODE_AND:
|
||||||
/************************************/
|
/************************************/
|
||||||
|
@ -156,8 +121,8 @@ void regexp_possuivante(NODE* r, int PS[])
|
||||||
/************************************/
|
/************************************/
|
||||||
for (int pos = 1; pos <= PS[0]; pos++)
|
for (int pos = 1; pos <= PS[0]; pos++)
|
||||||
{
|
{
|
||||||
if (r->fg->DP & (1 << (pos-1)))
|
if (m_fg->m_DP & (1 << (pos-1)))
|
||||||
PS[pos] |= r->fd->PP;
|
PS[pos] |= m_fd->m_PP;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case NODE_PLUS:
|
case NODE_PLUS:
|
||||||
|
@ -168,8 +133,8 @@ void regexp_possuivante(NODE* r, int PS[])
|
||||||
/************************************/
|
/************************************/
|
||||||
for (int pos = 1; pos <= PS[0]; pos++)
|
for (int pos = 1; pos <= PS[0]; pos++)
|
||||||
{
|
{
|
||||||
if (r->DP & (1 << (pos-1)))
|
if (m_DP & (1 << (pos-1)))
|
||||||
PS[pos] |= r->PP;
|
PS[pos] |= m_PP;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case NODE_STAR:
|
case NODE_STAR:
|
||||||
|
@ -179,32 +144,27 @@ void regexp_possuivante(NODE* r, int PS[])
|
||||||
/************************************/
|
/************************************/
|
||||||
for (int pos = 1; pos <= PS[0]; pos++)
|
for (int pos = 1; pos <= PS[0]; pos++)
|
||||||
{
|
{
|
||||||
if (r->DP & (1 << (pos-1)))
|
if (m_DP & (1 << (pos-1)))
|
||||||
PS[pos] |= r->PP;
|
PS[pos] |= m_PP;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// DEBUG only fonctions
|
// DEBUG only functions
|
||||||
////////////////////////////////////////////////*/
|
////////////////////////////////////////////////
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
#ifdef DEBUG_RE
|
||||||
void regexp_print_PS(int PS[])
|
void printPS(int PS[])
|
||||||
{
|
{
|
||||||
printf("** positions suivantes **\n");
|
printf("** next positions **\n");
|
||||||
for (int i = 1; i <= PS[0]; i++)
|
for (int i = 1; i <= PS[0]; i++)
|
||||||
{
|
{
|
||||||
printf("%02d: 0x%08x\n", i, PS[i]);
|
printf("%02d: 0x%08x\n", i, PS[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
|
||||||
void regexp_print_ptl(int ptl[])
|
void regexp_print_ptl(int ptl[])
|
||||||
{
|
{
|
||||||
printf("** pos -> lettre: ");
|
printf("** pos -> lettre: ");
|
||||||
|
@ -216,8 +176,6 @@ void regexp_print_ptl(int ptl[])
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
void regexp_print_letter(FILE* f, char l)
|
void regexp_print_letter(FILE* f, char l)
|
||||||
{
|
{
|
||||||
|
@ -239,8 +197,6 @@ void regexp_print_letter(FILE* f, char l)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
void regexp_print_letter2(FILE* f, char l)
|
void regexp_print_letter2(FILE* f, char l)
|
||||||
{
|
{
|
||||||
|
@ -262,19 +218,14 @@ void regexp_print_letter2(FILE* f, char l)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
#ifdef DEBUG_RE
|
||||||
static void print_node(FILE* f, NODE *n, int detail)
|
void Node::printNode(FILE* f, int detail) const
|
||||||
{
|
{
|
||||||
if (n == NULL)
|
switch (m_type)
|
||||||
return;
|
|
||||||
|
|
||||||
switch (n->type)
|
|
||||||
{
|
{
|
||||||
case NODE_VAR:
|
case NODE_VAR:
|
||||||
regexp_print_letter(f, n->var);
|
regexp_print_letter(f, m_var);
|
||||||
break;
|
break;
|
||||||
case NODE_OR:
|
case NODE_OR:
|
||||||
fprintf(f, "OR");
|
fprintf(f, "OR");
|
||||||
|
@ -292,71 +243,54 @@ static void print_node(FILE* f, NODE *n, int detail)
|
||||||
if (detail == 2)
|
if (detail == 2)
|
||||||
{
|
{
|
||||||
fprintf(f, "\\n pos=%d\\n annul=%d\\n PP=0x%04x\\n DP=0x%04x",
|
fprintf(f, "\\n pos=%d\\n annul=%d\\n PP=0x%04x\\n DP=0x%04x",
|
||||||
n->position, n->annulable, n->PP, n->DP);
|
m_position, m_annulable, m_PP, m_DP);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
void Node::printNodesRec(FILE* f, int detail) const
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
|
||||||
static void print_tree_nodes(FILE* f, NODE* n, int detail)
|
|
||||||
{
|
{
|
||||||
if (n == NULL)
|
if (m_fg)
|
||||||
return;
|
m_fg->printNodesRec(f, detail);
|
||||||
|
if (m_fd)
|
||||||
|
m_fd->printNodesRec(f, detail);
|
||||||
|
|
||||||
print_tree_nodes(f, n->fg, detail);
|
fprintf(f, "%d [ label=\"", m_number);
|
||||||
print_tree_nodes(f, n->fd, detail);
|
printNode(f, detail);
|
||||||
|
|
||||||
fprintf(f, "%d [ label=\"", n->number);
|
|
||||||
print_node(f, n, detail);
|
|
||||||
fprintf(f, "\"];\n");
|
fprintf(f, "\"];\n");
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
void Node::printEdgesRec(FILE *f) const
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
|
||||||
static void print_tree_edges(FILE *f, NODE *n)
|
|
||||||
{
|
{
|
||||||
if (n == NULL)
|
if (m_fg)
|
||||||
return;
|
m_fg->printEdgesRec(f);
|
||||||
|
if (m_fd)
|
||||||
|
m_fd->printEdgesRec(f);
|
||||||
|
|
||||||
print_tree_edges(f, n->fg);
|
switch (m_type)
|
||||||
print_tree_edges(f, n->fd);
|
|
||||||
|
|
||||||
switch (n->type)
|
|
||||||
{
|
{
|
||||||
case NODE_OR:
|
case NODE_OR:
|
||||||
fprintf(f, "%d -> %d;", n->number, n->fg->number);
|
fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
|
||||||
fprintf(f, "%d -> %d;", n->number, n->fd->number);
|
fprintf(f, "%d -> %d;", m_number, m_fd->m_number);
|
||||||
break;
|
break;
|
||||||
case NODE_AND:
|
case NODE_AND:
|
||||||
fprintf(f, "%d -> %d;", n->number, n->fg->number);
|
fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
|
||||||
fprintf(f, "%d -> %d;", n->number, n->fd->number);
|
fprintf(f, "%d -> %d;", m_number, m_fd->m_number);
|
||||||
break;
|
break;
|
||||||
case NODE_PLUS:
|
case NODE_PLUS:
|
||||||
case NODE_STAR:
|
case NODE_STAR:
|
||||||
fprintf(f, "%d -> %d;", n->number, n->fg->number);
|
fprintf(f, "%d -> %d;", m_number, m_fg->m_number);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/*////////////////////////////////////////////////
|
void Node::printTreeDot(const string &iFileName, int detail) const
|
||||||
////////////////////////////////////////////////*/
|
|
||||||
|
|
||||||
#ifdef DEBUG_RE
|
|
||||||
void regexp_print_tree(NODE* n, const string &iName, int detail)
|
|
||||||
{
|
{
|
||||||
FILE *f = fopen(iName.c_str(), "w");
|
FILE *f = fopen(iFileName.c_str(), "w");
|
||||||
if (f == NULL)
|
if (f == NULL)
|
||||||
return;
|
return;
|
||||||
fprintf(f, "digraph %s {\n", iName.c_str());
|
fprintf(f, "digraph %s {\n", iFileName.c_str());
|
||||||
print_tree_nodes(f, n, detail);
|
printNodesRec(f, detail);
|
||||||
print_tree_edges(f, n);
|
printEdgesRec(f);
|
||||||
fprintf(f, "fontsize=20;\n");
|
fprintf(f, "fontsize=20;\n");
|
||||||
fprintf(f, "}\n");
|
fprintf(f, "}\n");
|
||||||
fclose(f);
|
fclose(f);
|
||||||
|
@ -369,7 +303,7 @@ void regexp_print_tree(NODE* n, const string &iName, int detail)
|
||||||
}
|
}
|
||||||
else if (pid == 0)
|
else if (pid == 0)
|
||||||
{
|
{
|
||||||
execlp("dotty", "dotty", iName.c_str(), NULL);
|
execlp("dotty", "dotty", iFileName.c_str(), NULL);
|
||||||
printf("exec dotty failed\n");
|
printf("exec dotty failed\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
192
dic/regexp.h
192
dic/regexp.h
|
@ -28,6 +28,8 @@
|
||||||
#ifndef _REGEXP_H_
|
#ifndef _REGEXP_H_
|
||||||
#define _REGEXP_H_
|
#define _REGEXP_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#define NODE_TOP 0
|
#define NODE_TOP 0
|
||||||
#define NODE_VAR 1
|
#define NODE_VAR 1
|
||||||
#define NODE_OR 2
|
#define NODE_OR 2
|
||||||
|
@ -35,96 +37,31 @@
|
||||||
#define NODE_STAR 4
|
#define NODE_STAR 4
|
||||||
#define NODE_PLUS 5
|
#define NODE_PLUS 5
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
|
||||||
typedef struct node
|
class Node
|
||||||
{
|
{
|
||||||
int type;
|
public:
|
||||||
char var;
|
|
||||||
struct node *fg;
|
|
||||||
struct node *fd;
|
|
||||||
int number;
|
|
||||||
int position;
|
|
||||||
int annulable;
|
|
||||||
int PP;
|
|
||||||
int DP;
|
|
||||||
} NODE;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* different letters in the dictionary
|
|
||||||
*/
|
|
||||||
#define DIC_LETTERS 27
|
|
||||||
|
|
||||||
/**
|
|
||||||
* maximum number of accepted terminals in regular expressions
|
|
||||||
*/
|
|
||||||
#define REGEXP_MAX 32
|
|
||||||
|
|
||||||
/**
|
|
||||||
* special terminals that should not appear in the dictionary
|
|
||||||
*/
|
|
||||||
#define RE_EPSILON (DIC_LETTERS + 0)
|
|
||||||
#define RE_FINAL_TOK (DIC_LETTERS + 1)
|
|
||||||
#define RE_ALL_MATCH (DIC_LETTERS + 2)
|
|
||||||
#define RE_VOWL_MATCH (DIC_LETTERS + 3)
|
|
||||||
#define RE_CONS_MATCH (DIC_LETTERS + 4)
|
|
||||||
#define RE_USR1_MATCH (DIC_LETTERS + 5)
|
|
||||||
#define RE_USR2_MATCH (DIC_LETTERS + 6)
|
|
||||||
|
|
||||||
/**
|
|
||||||
* number of lists for regexp letter match \n
|
|
||||||
* 0 : all tiles \n
|
|
||||||
* 1 : vowels \n
|
|
||||||
* 2 : consonants \n
|
|
||||||
* 3 : user defined 1 \n
|
|
||||||
* 4 : user defined 2 \n
|
|
||||||
* x : lists used during parsing \n
|
|
||||||
*/
|
|
||||||
#define DIC_SEARCH_REGE_LIST (REGEXP_MAX)
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Structure used for Dic_search_RegE \n
|
|
||||||
* this structure is used to explicit letters list that will be matched
|
|
||||||
* against special tokens in the regular expression search
|
|
||||||
*/
|
|
||||||
struct search_RegE_list_t {
|
|
||||||
/** maximum length for results */
|
|
||||||
int minlength;
|
|
||||||
/** maximum length for results */
|
|
||||||
int maxlength;
|
|
||||||
/** special symbol associated with the list */
|
|
||||||
char symbl[DIC_SEARCH_REGE_LIST];
|
|
||||||
/** 0 or 1 if list is valid */
|
|
||||||
int valid[DIC_SEARCH_REGE_LIST];
|
|
||||||
/** 0 or 1 if letter is present in the list */
|
|
||||||
char letters[DIC_SEARCH_REGE_LIST][DIC_LETTERS];
|
|
||||||
};
|
|
||||||
|
|
||||||
#define RE_LIST_ALL_MATCH 0
|
|
||||||
#define RE_LIST_VOYL_MATCH 1
|
|
||||||
#define RE_LIST_CONS_MATCH 2
|
|
||||||
#define RE_LIST_USER_BEGIN 3
|
|
||||||
#define RE_LIST_USER_END 4
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a node for the syntactic tree used for
|
* Create a node for the syntactic tree used for
|
||||||
* parsing regular expressions \n
|
* parsing regular expressions
|
||||||
* The fonction is called by bison grammar rules
|
|
||||||
*/
|
*/
|
||||||
NODE* regexp_createNODE(int type,char v,NODE *fg,NODE *fd);
|
Node(int type, char v, Node *fg, Node *fd);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* delete regexp syntactic tree
|
* Delete regexp syntactic tree
|
||||||
*/
|
*/
|
||||||
void regexp_delete_tree(NODE * root);
|
~Node();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes positions, first positions (PP), last position (DP)
|
* Computes positions, first positions (PP), last position (DP),
|
||||||
* and translation table 'position to letter' (ptl)
|
* and annulable attribute
|
||||||
|
*
|
||||||
* @param p : max position found in the tree (must be initialized to 1)
|
* @param p : max position found in the tree (must be initialized to 1)
|
||||||
* @param n : number of nodes in the tree (must be initialized to 1)
|
* @param n : number of nodes in the tree (must be initialized to 1)
|
||||||
* @param ptl : position to letter translation table
|
* @param ptl : position to letter translation table
|
||||||
*/
|
*/
|
||||||
void regexp_parcours(NODE* r, int *p, int *n, int ptl[]);
|
void traverse(int &p, int &n, int ptl[]);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes 'next position' table used for building the
|
* Computes 'next position' table used for building the
|
||||||
|
@ -133,14 +70,106 @@ void regexp_parcours(NODE* r, int *p, int *n, int ptl[]);
|
||||||
* @param PS : next position table, PS[0] must contain the
|
* @param PS : next position table, PS[0] must contain the
|
||||||
* number of terminals contained in the regular expression
|
* number of terminals contained in the regular expression
|
||||||
*/
|
*/
|
||||||
void regexp_possuivante(NODE* r, int PS[]);
|
void nextPos(int PS[]);
|
||||||
|
|
||||||
|
/// Return the first position
|
||||||
|
int getFirstPos() const { return m_PP; }
|
||||||
|
|
||||||
|
#ifdef DEBUG_RE
|
||||||
|
/**
|
||||||
|
* Print the tree rooted at the current node to a file suitable
|
||||||
|
* for dot (Graphviz)
|
||||||
|
*/
|
||||||
|
void printTreeDot(const string &iFileName, int detail) const;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
private:
|
||||||
|
int m_type;
|
||||||
|
char m_var;
|
||||||
|
Node *m_fg;
|
||||||
|
Node *m_fd;
|
||||||
|
int m_number;
|
||||||
|
int m_position;
|
||||||
|
bool m_annulable;
|
||||||
|
int m_PP;
|
||||||
|
int m_DP;
|
||||||
|
|
||||||
|
#ifdef DEBUG_RE
|
||||||
|
/// Print the current node to file
|
||||||
|
void printNode(FILE* f, int detail) const;
|
||||||
|
|
||||||
|
/// Print recursively the current node and its subnodes to file
|
||||||
|
void printNodesRec(FILE *f, int detail) const;
|
||||||
|
|
||||||
|
/// Print recursively the edges of the tree rooted at the current node
|
||||||
|
void printEdgesRec(FILE *f) const;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* different letters in the dictionary
|
||||||
|
*/
|
||||||
|
#define DIC_LETTERS 63
|
||||||
|
|
||||||
|
/**
|
||||||
|
* maximum number of accepted terminals in regular expressions
|
||||||
|
*/
|
||||||
|
#define REGEXP_MAX 32
|
||||||
|
|
||||||
|
/**
|
||||||
|
* special terminals that should not appear in the dictionary
|
||||||
|
*/
|
||||||
|
#define RE_EPSILON (DIC_LETTERS + 0)
|
||||||
|
#define RE_FINAL_TOK (DIC_LETTERS + 1)
|
||||||
|
#define RE_ALL_MATCH (DIC_LETTERS + 2)
|
||||||
|
#define RE_VOWL_MATCH (DIC_LETTERS + 3)
|
||||||
|
#define RE_CONS_MATCH (DIC_LETTERS + 4)
|
||||||
|
#define RE_USR1_MATCH (DIC_LETTERS + 5)
|
||||||
|
#define RE_USR2_MATCH (DIC_LETTERS + 6)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* number of lists for regexp letter match \n
|
||||||
|
* 0 : all tiles \n
|
||||||
|
* 1 : vowels \n
|
||||||
|
* 2 : consonants \n
|
||||||
|
* 3 : user defined 1 \n
|
||||||
|
* 4 : user defined 2 \n
|
||||||
|
* x : lists used during parsing \n
|
||||||
|
*/
|
||||||
|
#define DIC_SEARCH_REGE_LIST (REGEXP_MAX)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Structure used for Dic_search_RegE \n
|
||||||
|
* this structure is used to explicit letters list that will be matched
|
||||||
|
* against special tokens in the regular expression search
|
||||||
|
*/
|
||||||
|
struct search_RegE_list_t
|
||||||
|
{
|
||||||
|
/** minimum length for results */
|
||||||
|
int minlength;
|
||||||
|
/** maximum length for results */
|
||||||
|
int maxlength;
|
||||||
|
/** special symbol associated with the list */
|
||||||
|
char symbl[DIC_SEARCH_REGE_LIST];
|
||||||
|
/** 0 or 1 if list is valid */
|
||||||
|
bool valid[DIC_SEARCH_REGE_LIST];
|
||||||
|
/** 0 or 1 if letter is present in the list */
|
||||||
|
bool letters[DIC_SEARCH_REGE_LIST][DIC_LETTERS];
|
||||||
|
};
|
||||||
|
|
||||||
|
#define RE_LIST_ALL_MATCH 0
|
||||||
|
#define RE_LIST_VOYL_MATCH 1
|
||||||
|
#define RE_LIST_CONS_MATCH 2
|
||||||
|
#define RE_LIST_USER_BEGIN 3
|
||||||
|
#define RE_LIST_USER_END 4
|
||||||
|
|
||||||
#define MAX_REGEXP_ERROR_LENGTH 500
|
#define MAX_REGEXP_ERROR_LENGTH 500
|
||||||
|
|
||||||
struct regexp_error_report_t {
|
struct regexp_error_report_t
|
||||||
int pos1;
|
{
|
||||||
int pos2;
|
int pos1;
|
||||||
char msg[MAX_REGEXP_ERROR_LENGTH];
|
int pos2;
|
||||||
|
char msg[MAX_REGEXP_ERROR_LENGTH];
|
||||||
};
|
};
|
||||||
|
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
@ -149,7 +178,6 @@ void regexp_print_letter(FILE* f, char l);
|
||||||
void regexp_print_letter2(FILE* f, char l);
|
void regexp_print_letter2(FILE* f, char l);
|
||||||
void regexp_print_PS(int PS[]);
|
void regexp_print_PS(int PS[]);
|
||||||
void regexp_print_ptl(int ptl[]);
|
void regexp_print_ptl(int ptl[]);
|
||||||
void regexp_print_tree(NODE* n, char* name, int detail);
|
|
||||||
|
|
||||||
#endif /* _REGEXP_H_ */
|
#endif /* _REGEXP_H_ */
|
||||||
|
|
||||||
|
|
|
@ -40,62 +40,40 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "dic.h"
|
#include "dic.h"
|
||||||
|
#include "header.h"
|
||||||
#include "regexp.h"
|
#include "regexp.h"
|
||||||
#include "encoding.h"
|
#include "encoding.h"
|
||||||
|
|
||||||
|
|
||||||
#define __UNUSED__ __attribute__((unused))
|
void init_letter_lists(const Dictionary &iDic, struct search_RegE_list_t *iList)
|
||||||
|
|
||||||
/********************************************************/
|
|
||||||
/********************************************************/
|
|
||||||
/********************************************************/
|
|
||||||
|
|
||||||
const unsigned int all_letter[DIC_LETTERS] =
|
|
||||||
{
|
{
|
||||||
/* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 */
|
memset(iList, 0, sizeof(*iList));
|
||||||
/* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 */
|
|
||||||
/* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
|
|
||||||
0,1,1,1,1, 1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1, 1, 1, 1, 1
|
|
||||||
};
|
|
||||||
|
|
||||||
const unsigned int vowels[DIC_LETTERS] =
|
|
||||||
{
|
|
||||||
/* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
|
|
||||||
0,1,0,0,0, 1,0,0,0,1,0, 0,0,0,0,1,0,0,0,0,0,1,0, 0, 0, 1, 0
|
|
||||||
};
|
|
||||||
|
|
||||||
const unsigned int consonants[DIC_LETTERS] =
|
|
||||||
{
|
|
||||||
/* x A B C D E F G H I J K L M N O P Q R S T U V W X Y Z */
|
|
||||||
0,0,1,1,1, 0,1,1,1,0,1, 1,1,1,1,0,1,1,1,1,1,0,1, 1, 1, 1, 1
|
|
||||||
};
|
|
||||||
|
|
||||||
void init_letter_lists(struct search_RegE_list_t *iList)
|
|
||||||
{
|
|
||||||
memset (iList, 0, sizeof(*iList));
|
|
||||||
iList->minlength = 1;
|
iList->minlength = 1;
|
||||||
iList->maxlength = 15;
|
iList->maxlength = 15;
|
||||||
iList->valid[0] = 1; // all letters
|
iList->valid[0] = true; // all letters
|
||||||
iList->symbl[0] = RE_ALL_MATCH;
|
iList->symbl[0] = RE_ALL_MATCH;
|
||||||
iList->valid[1] = 1; // vowels
|
iList->valid[1] = true; // vowels
|
||||||
iList->symbl[1] = RE_VOWL_MATCH;
|
iList->symbl[1] = RE_VOWL_MATCH;
|
||||||
iList->valid[2] = 1; // consonants
|
iList->valid[2] = true; // consonants
|
||||||
iList->symbl[2] = RE_CONS_MATCH;
|
iList->symbl[2] = RE_CONS_MATCH;
|
||||||
for (int i = 0; i < DIC_LETTERS; i++)
|
iList->letters[0][0] = false;
|
||||||
|
iList->letters[1][0] = false;
|
||||||
|
iList->letters[2][0] = false;
|
||||||
|
const wstring &allLetters = iDic.getHeader().getLetters();
|
||||||
|
for (size_t i = 1; i <= allLetters.size(); ++i)
|
||||||
{
|
{
|
||||||
iList->letters[0][i] = all_letter[i];
|
iList->letters[0][i] = true;
|
||||||
iList->letters[1][i] = vowels[i];
|
iList->letters[1][i] = iDic.getHeader().isVowel(i);
|
||||||
iList->letters[2][i] = consonants[i];
|
iList->letters[2][i] = iDic.getHeader().isConsonant(i);
|
||||||
}
|
}
|
||||||
iList->valid[3] = 0; // user defined list 1
|
|
||||||
|
iList->valid[3] = false; // user defined list 1
|
||||||
iList->symbl[3] = RE_USR1_MATCH;
|
iList->symbl[3] = RE_USR1_MATCH;
|
||||||
iList->valid[4] = 0; // user defined list 2
|
iList->valid[4] = false; // user defined list 2
|
||||||
iList->symbl[4] = RE_USR2_MATCH;
|
iList->symbl[4] = RE_USR2_MATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
/********************************************************/
|
|
||||||
/********************************************************/
|
|
||||||
/********************************************************/
|
|
||||||
void usage(const char *iBinaryName)
|
void usage(const char *iBinaryName)
|
||||||
{
|
{
|
||||||
cerr << _("usage: %s dictionary") << iBinaryName << endl;
|
cerr << _("usage: %s dictionary") << iBinaryName << endl;
|
||||||
|
@ -142,7 +120,7 @@ int main(int argc, char* argv[])
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* automaton */
|
/* automaton */
|
||||||
init_letter_lists(®List);
|
init_letter_lists(dic, ®List);
|
||||||
vector<wstring> wordList;
|
vector<wstring> wordList;
|
||||||
dic.searchRegExp(convertToWc(er), wordList, ®List);
|
dic.searchRegExp(convertToWc(er), wordList, ®List);
|
||||||
|
|
||||||
|
@ -163,7 +141,7 @@ int main(int argc, char* argv[])
|
||||||
}
|
}
|
||||||
catch (...)
|
catch (...)
|
||||||
{
|
{
|
||||||
std::cerr << "Unkown exception taken" << endl;
|
std::cerr << "Unknown exception taken" << endl;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,5 +12,11 @@ x .*(cba)*b
|
||||||
x .*(cba)+b
|
x .*(cba)+b
|
||||||
x .*(nn)+.*
|
x .*(nn)+.*
|
||||||
x .*(nn)+.*x 200
|
x .*(nn)+.*x 200
|
||||||
|
x ne.
|
||||||
|
x ne:v:
|
||||||
|
x ne:v:?
|
||||||
|
x ne:c:s
|
||||||
|
x (ass)+..
|
||||||
|
x c:v:+p
|
||||||
q
|
q
|
||||||
|
|
||||||
|
|
|
@ -539,4 +539,52 @@ vallonneux
|
||||||
vanneaux
|
vanneaux
|
||||||
vicennaux
|
vicennaux
|
||||||
57 printed results
|
57 printed results
|
||||||
|
commande> x ne.
|
||||||
|
search for ne. (50,1,15)
|
||||||
|
nee
|
||||||
|
nef
|
||||||
|
nem
|
||||||
|
neo
|
||||||
|
nes
|
||||||
|
net
|
||||||
|
ney
|
||||||
|
nez
|
||||||
|
8 printed results
|
||||||
|
commande> x ne:v:
|
||||||
|
search for ne:v: (50,1,15)
|
||||||
|
nee
|
||||||
|
neo
|
||||||
|
ney
|
||||||
|
3 printed results
|
||||||
|
commande> x ne:v:?
|
||||||
|
search for ne:v:? (50,1,15)
|
||||||
|
ne
|
||||||
|
nee
|
||||||
|
neo
|
||||||
|
ney
|
||||||
|
4 printed results
|
||||||
|
commande> x ne:c:s
|
||||||
|
search for ne:c:s (50,1,15)
|
||||||
|
nefs
|
||||||
|
nems
|
||||||
|
nets
|
||||||
|
news
|
||||||
|
neys
|
||||||
|
5 printed results
|
||||||
|
commande> x (ass)+..
|
||||||
|
search for (ass)+.. (50,1,15)
|
||||||
|
assai
|
||||||
|
assassin
|
||||||
|
assec
|
||||||
|
asses
|
||||||
|
assez
|
||||||
|
assis
|
||||||
|
assit
|
||||||
|
7 printed results
|
||||||
|
commande> x c:v:+p
|
||||||
|
search for c:v:+p (50,1,15)
|
||||||
|
cap
|
||||||
|
cep
|
||||||
|
coup
|
||||||
|
3 printed results
|
||||||
commande> q
|
commande> q
|
||||||
|
|
|
@ -800,11 +800,11 @@ void eliot_regexp_build_default_llist(const Dictionary &iDic,
|
||||||
llist.symbl[3] = RE_USR1_MATCH;
|
llist.symbl[3] = RE_USR1_MATCH;
|
||||||
llist.symbl[5] = RE_USR2_MATCH;
|
llist.symbl[5] = RE_USR2_MATCH;
|
||||||
|
|
||||||
llist.valid[0] = 1; // all letters
|
llist.valid[0] = true; // all letters
|
||||||
llist.valid[1] = 1; // vowels
|
llist.valid[1] = true; // vowels
|
||||||
llist.valid[2] = 1; // consonants
|
llist.valid[2] = true; // consonants
|
||||||
llist.valid[3] = 0; // user defined list 1
|
llist.valid[3] = false; // user defined list 1
|
||||||
llist.valid[4] = 0; // user defined list 2
|
llist.valid[4] = false; // user defined list 2
|
||||||
|
|
||||||
for (int i = 0; i < DIC_SEARCH_REGE_LIST; i++)
|
for (int i = 0; i < DIC_SEARCH_REGE_LIST; i++)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in a new issue