/*****************************************************************************
 * Eliot
 * Copyright (C) 1999-2012 Antoine Fraboulet
 * Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *****************************************************************************/
|
2005-04-19 18:26:50 +02:00
|
|
|
|
2009-11-29 17:01:31 +01:00
|
|
|
#ifndef REGEXP_H_
|
|
|
|
#define REGEXP_H_
|
2004-06-20 22:13:59 +02:00
|
|
|
|
2008-07-07 19:29:59 +02:00
|
|
|
#include <stdint.h>

#include <iosfwd>
#include <string>
#include <vector>
|
2008-07-07 19:29:59 +02:00
|
|
|
|
2004-06-20 22:13:59 +02:00
|
|
|
/**
 * Kinds of nodes in the syntactic tree of a regular expression.
 * NOTE(review): presumably these are the values passed as the 'type'
 * argument of the Node constructor -- confirm against the parser.
 */
#define NODE_TOP  0
#define NODE_VAR  1
#define NODE_OR   2
#define NODE_AND  3
#define NODE_STAR 4
#define NODE_PLUS 5
|
2004-06-20 22:13:59 +02:00
|
|
|
|
2008-07-07 19:29:59 +02:00
|
|
|
using std::string;
|
2012-05-16 21:36:26 +02:00
|
|
|
using std::vector;
|
2008-01-08 14:52:32 +01:00
|
|
|
|
2008-07-07 19:29:59 +02:00
|
|
|
/**
 * Node of the syntactic tree used for parsing regular expressions.
 *
 * NOTE(review): the destructor is documented as deleting the syntactic tree
 * and the class stores raw child pointers while remaining copyable.
 * Presumably copying a Node would lead to a double delete -- confirm against
 * the implementation before copying instances.
 */
class Node
{
public:
    /**
     * Create a node for the syntactic tree used for
     * parsing regular expressions
     *
     * @param type : kind of node (presumably one of the NODE_* values)
     * @param v : character attached to the node
     * @param fg : first child (presumably the left one -- confirm)
     * @param fd : second child (presumably the right one -- confirm)
     */
    Node(int type, char v, Node *fg, Node *fd);

    /**
     * Delete regexp syntactic tree
     */
    ~Node();

    /**
     * Computes positions, first positions (PP), last position (DP),
     * and annulable attribute
     *
     * @param p : max position found in the tree (must be initialized to 1)
     * @param n : number of nodes in the tree (must be initialized to 1)
     * @param ptl : position to letter translation table
     */
    void traverse(int &p, int &n, int ptl[]);

    /**
     * Computes 'next position' table used for building the
     * automaton. The computation starts from the current node.
     *
     * @param PS : next position table, PS[0] must contain the
     * number of terminals contained in the regular expression
     */
    void nextPos(uint64_t PS[]);

    /**
     * Return the first position.
     * The value is returned with its full width: m_PP is stored as a
     * uint64_t, and returning it as an int would drop its upper bits.
     */
    uint64_t getFirstPos() const { return m_PP; }

#ifdef DEBUG_RE
    /**
     * Print the tree rooted at the current node to a file suitable
     * for dot (Graphviz)
     */
    void printTreeDot(const std::string &iFileName, int detail) const;
#endif

private:
    /// Kind of node, as given to the constructor
    int m_type;
    /// Character attached to the node, as given to the constructor
    char m_var;
    /// Child nodes, as given to the constructor
    Node *m_fg;
    Node *m_fd;
    /// Attributes which, per the traverse() documentation, are computed
    /// by traverse(): node number, position, annulable flag, first
    /// positions (PP) and last position (DP)
    int m_number;
    int m_position;
    bool m_annulable;
    uint64_t m_PP;
    uint64_t m_DP;

#ifdef DEBUG_RE
    /// Print the current node to file
    void printNode(std::ostream &out, int detail) const;

    /// Print recursively the current node and its subnodes to file
    void printNodesRec(std::ostream &out, int detail) const;

    /// Print recursively the edges of the tree rooted at the current node
    void printEdgesRec(std::ostream &out) const;
#endif
};
|
|
|
|
|
|
|
|
/**
 * Number of different letters in the dictionary.
 * Values from DIC_LETTERS upwards are used for special terminals
 * that should not appear in the dictionary.
 */
#define DIC_LETTERS 63
|
|
|
|
|
|
|
|
/**
 * Maximum number of terminals accepted in a regular expression
 */
#define REGEXP_MAX 32
|
|
|
|
|
|
|
|
/**
 * Special terminals that should not appear in the dictionary.
 * They are numbered consecutively, starting at DIC_LETTERS.
 */
#define RE_EPSILON    (DIC_LETTERS + 0)
#define RE_FINAL_TOK  (DIC_LETTERS + 1)
#define RE_ALL_MATCH  (DIC_LETTERS + 2)
#define RE_VOWL_MATCH (DIC_LETTERS + 3)
#define RE_CONS_MATCH (DIC_LETTERS + 4)
#define RE_USR1_MATCH (DIC_LETTERS + 5)
#define RE_USR2_MATCH (DIC_LETTERS + 6)
|
2005-11-04 21:00:05 +01:00
|
|
|
|
2008-07-07 19:29:59 +02:00
|
|
|
/**
 * Structure used for dic.searchRegExp
 * It spells out the lists of letters that will be matched
 * against special tokens in the regular expression search
 */
struct searchRegExpLists
{
    /** special symbol associated with the list */
    std::vector<char> symbl;

    /**
     * 0 or 1 if letter is present in the list.
     * The inner vector should have a length of DIC_LETTERS+1 (it is a bitmask)
     */
    std::vector<std::vector<bool> > letters;
};
|
|
|
|
|
|
|
|
/**
 * Identifiers of the letter lists.
 * NOTE(review): presumably indices into the vectors of searchRegExpLists,
 * with RE_LIST_USER_BEGIN..RE_LIST_USER_END covering the user-defined
 * lists -- confirm against the callers.
 */
#define RE_LIST_ALL_MATCH  0
#define RE_LIST_VOYL_MATCH 1
#define RE_LIST_CONS_MATCH 2
#define RE_LIST_USER_BEGIN 3
#define RE_LIST_USER_END   4
|
|
|
|
|
2012-05-16 20:43:26 +02:00
|
|
|
/// Return a string for the letter l
/// (presumably a printable form, for display/debugging -- confirm)
std::string regexpPrintLetter(char l);

/// Print the table PS (presumably the 'next position' table -- confirm).
/// NOTE(review): the table is declared as int[] here, whereas
/// Node::nextPos() fills a uint64_t[] table -- confirm which one is intended.
void regexp_print_PS(int PS[]);

/// Print the table ptl
/// (presumably the position to letter translation table -- confirm)
void regexp_print_ptl(int ptl[]);
|
2004-06-20 22:13:59 +02:00
|
|
|
|
2008-01-08 14:52:32 +01:00
|
|
|
#endif /* REGEXP_H_ */
|
2006-01-01 20:51:00 +01:00
|
|
|
|