eliot/dic/regexp.cpp
Olivier Teulière e7a8d01a8d Merged the "cppdic" branch back into HEAD.
There are too many changes to list properly; here is an overview of the main ones:
 - the dictionary is now in C++
 - the dictionary has a new format, where it is possible to specify the letters,
   their points, their frequency, ... It is backwards compatible.
 - Eliot now supports non-ASCII characters everywhere
 - i18n of the compdic, listdic, regexpmain binaries
 - i18n of the wxWidgets interface (now in english by default)
2008-01-08 13:52:32 +00:00

384 lines
9.8 KiB
C++

/*****************************************************************************
* Eliot
* Copyright (C) 1999-2006 Antoine Fraboulet
* Authors: Antoine Fraboulet <antoine.fraboulet @@ free.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*****************************************************************************/
/**
* \file regexp.cpp
* \brief Regular Expression functions
* \author Antoine Fraboulet
* \date 2005
*/
#include "config.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef HAVE_SYS_WAIT_H
# include <sys/wait.h>
#endif
#include <unistd.h>
#include "dic.h"
#include "regexp.h"
#include "automaton.h"
/*
 * Debug trace helper: PDBG(x) expands to its argument when DEBUG_RE2 is
 * defined and to nothing otherwise. The outer #ifndef keeps any PDBG
 * definition that is already in effect.
 */
#ifndef PDBG
#ifdef DEBUG_RE2
#define PDBG(x) x
#else
#define PDBG(x)
#endif
#endif
/**
 * Allocate a syntax-tree node with malloc() and initialise every field.
 *
 * The computed attributes (number, position, annulable, PP, DP) start at 0;
 * they are filled in later by regexp_parcours().
 *
 * @param type node kind (NODE_VAR, NODE_OR, NODE_AND, NODE_PLUS, NODE_STAR)
 * @param v    value stored in the node's var field
 * @param fg   left child, may be NULL
 * @param fd   right child, may be NULL
 * @return the new node, or NULL when the allocation fails. The node is
 *         released with free(), e.g. through regexp_delete_tree().
 */
NODE* regexp_createNODE(int type, char v, NODE *fg, NODE *fd)
{
    NODE *node = (NODE *)malloc(sizeof(*node));
    if (node == NULL)
    {
        // Out of memory: report it to the caller instead of writing
        // through a null pointer.
        return NULL;
    }

    node->type = type;
    node->var = v;
    node->fd = fd;
    node->fg = fg;
    node->number = 0;
    node->position = 0;
    node->annulable = 0;
    node->PP = 0;
    node->DP = 0;
    return node;
}
/**
 * Release a whole tree: both subtrees first, then the node itself.
 * @param root root of the tree to free; NULL is accepted and ignored
 */
void regexp_delete_tree(NODE *root)
{
    if (root != NULL)
    {
        NODE *left = root->fg;
        NODE *right = root->fd;

        regexp_delete_tree(left);
        regexp_delete_tree(right);
        free(root);
    }
}
#ifdef DEBUG_RE
/* Forward declaration: print_node() is defined further down in this file. */
static void print_node(FILE*, NODE *n, int detail);
#endif
/**
 * Post-order traversal that fills in the number, position, annulable,
 * PP and DP fields of every node.
 *
 * A leaf (NODE_VAR) takes the current value of *p as its position, stores
 * its letter in ptl[position] and advances *p; its PP and DP are the single
 * bit (position - 1). Inner nodes get position 0 and derive annulable, PP
 * and DP from the values already computed for their children. Nodes of any
 * other type only receive a number.
 *
 * NOTE(review): the shift by (position - 1) presumes that *p starts at 1 and
 * stays within the number of usable bits of an int -- confirm with callers.
 *
 * @param r   root of the (sub)tree, may be NULL
 * @param p   in/out: next leaf position to hand out
 * @param n   in/out: next node number to hand out
 * @param ptl position -> letter table, written at index "position"
 */
void regexp_parcours(NODE* r, int *p, int *n, int ptl[])
{
    if (r == NULL)
        return;

    NODE *left = r->fg;
    NODE *right = r->fd;

    // Children first: their attributes are needed below
    regexp_parcours(left, p, n, ptl);
    regexp_parcours(right, p, n, ptl);

    switch (r->type)
    {
        case NODE_VAR:
            r->position = *p;
            ptl[*p] = r->var;
            ++*p;
            r->annulable = 0;
            r->PP = 1 << (r->position - 1);
            r->DP = r->PP;
            break;

        case NODE_OR:
            r->position = 0;
            r->annulable = left->annulable || right->annulable;
            r->PP = left->PP | right->PP;
            r->DP = left->DP | right->DP;
            break;

        case NODE_AND:
            r->position = 0;
            r->annulable = left->annulable && right->annulable;
            // The right operand contributes to PP only when the left one
            // can be empty
            r->PP = left->PP;
            if (left->annulable)
                r->PP |= right->PP;
            // Symmetrically, the left operand contributes to DP only when
            // the right one can be empty
            r->DP = right->DP;
            if (right->annulable)
                r->DP |= left->DP;
            break;

        case NODE_PLUS:
        case NODE_STAR:
            // Both repetitions inherit PP/DP from their operand; only the
            // star is marked annulable
            r->position = 0;
            r->annulable = (r->type == NODE_STAR) ? 1 : 0;
            r->PP = left->PP;
            r->DP = left->DP;
            break;

        default:
            break;
    }

    r->number = *n;
    ++*n;
}
/**
 * Post-order traversal that accumulates the "next position" table PS.
 *
 * PS[0] holds the number of positions; PS[k] (1 <= k <= PS[0]) is a bit set
 * in which bit (j - 1) stands for position j.
 *
 *  - NODE_AND:  for every position p in DP(left),  PS[p] |= PP(right)
 *  - NODE_PLUS,
 *    NODE_STAR: for every position p in DP(node),  PS[p] |= PP(node)
 *
 * Other node types leave PS untouched.
 *
 * @param r  root of the (sub)tree, may be NULL
 * @param PS next-position table, updated in place
 */
void regexp_possuivante(NODE* r, int PS[])
{
    if (r == NULL)
        return;

    regexp_possuivante(r->fg, PS);
    regexp_possuivante(r->fd, PS);

    // Node whose DP selects the entries to update, and node whose PP is
    // merged into those entries
    const NODE *lastOwner = NULL;
    const NODE *firstOwner = NULL;
    switch (r->type)
    {
        case NODE_AND:
            lastOwner = r->fg;
            firstOwner = r->fd;
            break;
        case NODE_PLUS:
        case NODE_STAR:
            lastOwner = r;
            firstOwner = r;
            break;
        default:
            return;
    }

    const int count = PS[0];
    for (int bit = 0; bit < count; ++bit)
    {
        if (lastOwner->DP & (1 << bit))
            PS[bit + 1] |= firstOwner->PP;
    }
}
/*////////////////////////////////////////////////
// DEBUG-only functions
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Print the PS table on stdout, one hexadecimal entry per line.
 * @param PS table to dump; PS[0] is the number of entries, stored in
 *           PS[1] .. PS[PS[0]]
 */
void regexp_print_PS(int PS[])
{
    const int count = PS[0];
    int pos = 1;

    printf("** positions suivantes **\n");
    while (pos <= count)
    {
        printf("%02d: 0x%08x\n", pos, PS[pos]);
        ++pos;
    }
}
#endif
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Print the position -> letter table on stdout, on a single line.
 * @param ptl table to dump; ptl[0] is the number of entries, stored in
 *            ptl[1] .. ptl[ptl[0]] and printed with %c
 */
void regexp_print_ptl(int ptl[])
{
    const int count = ptl[0];
    int pos = 1;

    printf("** pos -> lettre: ");
    while (pos <= count)
    {
        printf("%d=%c ", pos, ptl[pos]);
        ++pos;
    }
    printf("\n");
}
#endif
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
/**
 * Write a verbose, parenthesised description of a letter code to f.
 *
 * The special RE_* codes are shown with a symbol followed by the numeric
 * code. Any other code below RE_FINAL_TOK is shown as the character
 * l + 'a' - 1 together with its numeric value; the remaining codes are
 * shown as a list number relative to RE_LIST_USER_END.
 *
 * @param f output stream
 * @param l letter code to describe
 */
void regexp_print_letter(FILE* f, char l)
{
    switch (l)
    {
        case RE_EPSILON:
            fprintf(f, "( & [%d])", l);
            return;
        case RE_FINAL_TOK:
            fprintf(f, "( # [%d])", l);
            return;
        case RE_ALL_MATCH:
            fprintf(f, "( . [%d])", l);
            return;
        case RE_VOWL_MATCH:
            fprintf(f, "(:v: [%d])", l);
            return;
        case RE_CONS_MATCH:
            fprintf(f, "(:c: [%d])", l);
            return;
        case RE_USR1_MATCH:
            fprintf(f, "(:1: [%d])", l);
            return;
        case RE_USR2_MATCH:
            fprintf(f, "(:2: [%d])", l);
            return;
        default:
            break;
    }

    // Not a special token: either a plain letter or a user list
    if (l < RE_FINAL_TOK)
    {
        fprintf(f, " (%c [%d]) ", l + 'a' - 1, l);
        return;
    }
    fprintf(f, " (liste %d)", l - RE_LIST_USER_END);
}
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
/**
 * Write the compact form of a letter code to f.
 *
 * The special RE_* codes are written as a fixed symbol. Any other code
 * below RE_FINAL_TOK is written as the character l + 'a' - 1; the remaining
 * codes are written as 'l' followed by a list number relative to
 * RE_LIST_USER_END.
 *
 * @param f output stream
 * @param l letter code to write
 */
void regexp_print_letter2(FILE* f, char l)
{
    const char *symbol = NULL;

    switch (l)
    {
        case RE_EPSILON:    symbol = "&";   break;
        case RE_FINAL_TOK:  symbol = "#";   break;
        case RE_ALL_MATCH:  symbol = ".";   break;
        case RE_VOWL_MATCH: symbol = ":v:"; break;
        case RE_CONS_MATCH: symbol = ":c:"; break;
        case RE_USR1_MATCH: symbol = ":1:"; break;
        case RE_USR2_MATCH: symbol = ":2:"; break;
        default:
            break;
    }

    if (symbol != NULL)
    {
        // Fixed symbols contain no conversion, so they are written as-is
        fputs(symbol, f);
    }
    else if (l < RE_FINAL_TOK)
    {
        fprintf(f, "%c", l + 'a' - 1);
    }
    else
    {
        fprintf(f, "l%d", l - RE_LIST_USER_END);
    }
}
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Write the label text of one node to f.
 *
 * Leaves are rendered with regexp_print_letter(); operators are rendered as
 * "OR", "AND", "+" or "*". When detail equals 2, the position, annulable,
 * PP and DP attributes are appended, separated by the two-character
 * sequence backslash + 'n'.
 *
 * @param f      output stream
 * @param n      node to describe; NULL prints nothing
 * @param detail 2 to append the computed attributes, any other value not to
 */
static void print_node(FILE* f, NODE *n, int detail)
{
    if (n == NULL)
        return;

    const char *label = NULL;
    switch (n->type)
    {
        case NODE_VAR:
            regexp_print_letter(f, n->var);
            break;
        case NODE_OR:
            label = "OR";
            break;
        case NODE_AND:
            label = "AND";
            break;
        case NODE_PLUS:
            label = "+";
            break;
        case NODE_STAR:
            label = "*";
            break;
        default:
            break;
    }
    if (label != NULL)
        fputs(label, f);

    if (detail != 2)
        return;
    fprintf(f, "\\n pos=%d\\n annul=%d\\n PP=0x%04x\\n DP=0x%04x",
            n->position, n->annulable, n->PP, n->DP);
}
#endif
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Emit one labelled node statement per tree node, children before their
 * parent. Each statement is "<number> [ label="<text>"];" where the text
 * comes from print_node().
 *
 * @param f      output stream
 * @param n      root of the (sub)tree, may be NULL
 * @param detail forwarded to print_node()
 */
static void print_tree_nodes(FILE* f, NODE* n, int detail)
{
    if (n != NULL)
    {
        print_tree_nodes(f, n->fg, detail);
        print_tree_nodes(f, n->fd, detail);

        fprintf(f, "%d [ label=\"", n->number);
        print_node(f, n, detail);
        fputs("\"];\n", f);
    }
}
#endif
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Emit the "parent -> child;" edge statements of the tree, children before
 * their parent. NODE_OR and NODE_AND produce an edge to each of their two
 * children, NODE_PLUS and NODE_STAR a single edge to their left child;
 * other node types produce no edge.
 *
 * @param f output stream
 * @param n root of the (sub)tree, may be NULL
 */
static void print_tree_edges(FILE *f, NODE *n)
{
    if (n == NULL)
        return;

    print_tree_edges(f, n->fg);
    print_tree_edges(f, n->fd);

    // Number of outgoing edges for this kind of node
    int edges = 0;
    switch (n->type)
    {
        case NODE_OR:
        case NODE_AND:
            edges = 2;
            break;
        case NODE_PLUS:
        case NODE_STAR:
            edges = 1;
            break;
        default:
            break;
    }

    if (edges >= 1)
        fprintf(f, "%d -> %d;", n->number, n->fg->number);
    if (edges == 2)
        fprintf(f, "%d -> %d;", n->number, n->fd->number);
}
#endif
/*////////////////////////////////////////////////
////////////////////////////////////////////////*/
#ifdef DEBUG_RE
/**
 * Dump the tree as a Graphviz digraph into the file named iName and, when
 * HAVE_SYS_WAIT_H is defined, display that file with the external "dotty"
 * program, waiting until the viewer exits.
 *
 * Nothing happens if the file cannot be opened for writing.
 *
 * NOTE(review): iName is also used verbatim as the graph identifier; a name
 * that is not a plain identifier (e.g. containing '.' or '/') is likely to
 * be rejected by Graphviz -- confirm against the callers.
 *
 * @param n      root of the tree to dump, may be NULL
 * @param iName  output file name, also used as the graph name
 * @param detail forwarded to the node printer (2 adds the node attributes)
 */
void regexp_print_tree(NODE* n, const string &iName, int detail)
{
    FILE *f = fopen(iName.c_str(), "w");
    if (f == NULL)
        return;
    fprintf(f, "digraph %s {\n", iName.c_str());
    print_tree_nodes(f, n, detail);
    print_tree_edges(f, n);
    fprintf(f, "fontsize=20;\n");
    fprintf(f, "}\n");
    fclose(f);

#ifdef HAVE_SYS_WAIT_H
    // Flush pending output first, so that the child does not inherit
    // buffered data and write it a second time.
    fflush(NULL);

    pid_t pid = fork();
    if (pid > 0)
    {
        // Wait for this very child rather than for any child of the process
        waitpid(pid, NULL, 0);
    }
    else if (pid == 0)
    {
        // The variadic argument list must end with a null *pointer*;
        // a bare NULL may be passed as an integer.
        execlp("dotty", "dotty", iName.c_str(), (char *)NULL);

        // Only reached when exec failed
        printf("exec dotty failed\n");
        fflush(stdout);
        // _exit: do not run the parent's atexit handlers in the child
        _exit(1);
    }
    else
    {
        perror("fork");
    }
#endif
}
#endif
/// Local Variables:
/// mode: hs-minor
/// c-basic-offset: 2
/// End: