Trochu nudných základů na začátek: http://cs.wikipedia.org/wiki/Lexik%C3%A ... al%C3%BDza
lexikální analyzátor vezme vstupní soubor - zdrojáky, a ty rozseká na lexikální elementy - int, float, if, while, atd...
Takže, jak na to ? Ukážeme si to na pdomnožině C++... zdaleka to neumí všechno neumí to ani funkce, takže ani main()
první potřebujeme nějakej seznam lexikálních elementů.
LexSymbs.h:
- Kód: Vybrat vše
#ifndef __LEX_SYMBS__
#define __LEX_SYMBS__
/* zavedeni lexikalnich elementu */
#define T_NULL 0 /* dosazen EOF, v pocatecnim stavu */
#define T_ERROR 1 /* chyba, neznamy lex. element */
#define T_PLUS 2 /* operator + */
#define T_PLUSPLUS 3 /* operator ++ */
#define T_RPLUS 4 /* operator += */
#define T_MINUS 5 /* operator - */
#define T_MINUSMINUS 6 /* operator -- */
#define T_RMINUS 7 /* operator -= */
#define T_TILDA 8 /* operator ~ */
#define T_VYKR 9 /* operator ! */
#define T_RVYKR 10 /* operator != */
#define T_OTVN 11 /* operator ( */
#define T_ZAVN 12 /* operator ) */
#define T_KRAT 13 /* operator * */
#define T_RKRAT 14 /* operator *= */
#define T_MOD 15 /* operator % */
#define T_RMOD 16 /* operator %= */
#define T_LESS 17 /* operator < */
#define T_LESSLESS 18 /* operator << */
#define T_RLESS 19 /* operator <= */
#define T_RLESSLESS 20 /* operator <<= */
#define T_HIGH 21 /* operator > */
#define T_HIGHHIGH 22 /* operator >> */
#define T_RHIGH 23 /* operator >= */
#define T_RHIGHHIGH 24 /* operator >>= */
#define T_ROV 25 /* operator = */
#define T_RROV 26 /* operator == */
#define T_AMP 27 /* operator & */
#define T_AND 28 /* operator && */
#define T_RAMP 29 /* operator &= */
#define T_XOR 30 /* operator ^ */
#define T_RXOR 31 /* operator ^= */
#define T_PIPE 32 /* operator | */
#define T_OR 33 /* operator || */
#define T_RPIPE 34 /* operator |= */
#define T_SOTV 35 /* operator { */
#define T_SZAV 36 /* operator } */
#define T_DIV 37 /* operator / */
#define T_RDIV 38 /* operator /= */
#define LEX_INT 39 /* celociselna konstanta */
#define LEX_FLOAT 40 /* konstanta v plovouci radove carce */
#define LEX_IDENT 41 /* identifikator, ne klicove slovo */
#define KW_BREAK 42 /* klicove slovo break */
#define KW_CONTINUE 43 /* klicove slovo continue */
#define KW_WHILE 44 /* klicove slovo while */
#define KW_IF 45 /* klicove slovo if */
#define KW_ELSE 46 /* klicove slovo else */
#define KW_FLOAT 47 /* klicove slovo float */
#define KW_INT 48 /* klicove slovo int */
#define T_STRED 49 /* ; */
#define T_CARKA 50 /* , */
#endif
Tak, dale nejaka hlavicka pro lexikalni analyzator:
LexAnal.h
- Kód: Vybrat vše
#ifndef __LEX_ANAL__
#define __LEX_ANAL__
#include <iostream>
#include <fstream>
using namespace std;
#define NrKeywords 7
static struct /* seznam klicovych slov spolu s */
{ /* hodnotou lex. elementu */
int Token;
char Keyword [20];
} Keywords [NrKeywords] =
{
{ KW_IF, "if" },
{ KW_ELSE, "else" },
{ KW_WHILE, "while" },
{ KW_FLOAT, "float" },
{ KW_INT, "int" },
{ KW_BREAK, "break" },
{ KW_CONTINUE, "continue" }
};
class LexAnal {
public:
/* atributy nastavene lexikalnim analyzatorem */
/* platne pro LEX_INT */
int Lex_Int;
/* platne pro LEX_FLOAT */
double Lex_Float;
/* platne pro LEX_IDENT */
char Lex_Str [2048];
/* otevreny soubor, ze ktereho LA cte svuj vstup */
char * Fp;
/* reprezentuje vstupni stram */
ifstream is;
LexAnal() {
}
/* konstruktor, prijme nazev souboru ze ktereho ma LexAnal cist */
LexAnal(char* file_name) : Fp(file_name) {
is.open(file_name);
if (!is.is_open()) {
cout << "Nezdarilo se otevreni souboru pro cteni... \n";
exit(1);
}
}
/* destruktor */
~LexAnal() {
is.close();
delete Fp;
}
/* hlavni metoda, vraci cislo precteneho lexikalniho znaku */
int NextToken(void);
};
#endif
zde je hlavicka dulezite funkce - nextToken
LexAnal.cpp:
- Kód: Vybrat vše
#include "LexSymbs.h"
#include "LexAnal.h"
#include <ctype.h>
#include <cmath>
#define isodigit(x) (x>='0' && x<='7') /* makro - test, zda x je oct. cifra */
int LexAnal::NextToken() {
int State = 0, Char; /* Stav ve kterem se nachazim a precteny znak */
int i, StrPos = 0; /* StrPos = pocet nactenych znaku identifikatoru */
int Exp = 0, ExpSign;/* Velikost a znamenko exponentu */
double DmsPos = 0.1; /* prave zpracovavany desetinny rad */
while(true) {
Char = is.get();
switch (State) {
case 0:
if (Char == ' ' || Char == '\n' || Char == '\t')
break;
if (isalpha(Char) || Char == '_') {
Lex_Str[0] = Char;
StrPos = 1;
State = 1;
break;
}
if (Char == '0') {
Lex_Int = 0;
Lex_Float = 0.0;
State = 8;
break;
}
if (Char == '.') {
Lex_Float = 0.0;
DmsPos = 0.1;
State = 7;
break;
}
if (isdigit(Char)) {
Lex_Int = (Char - '0');
Lex_Float = float (Char - '0');
State = 2;
break;
}
if (Char == '/') {
State = 15;
break;
}
if (Char == '+') {
State = 20;
break;
}
if (Char == '-') {
State = 23;
break;
}
if (Char == '!') {
State = 26;
break;
}
if (Char == '*') {
State = 31;
break;
}
if (Char == '%') {
State = 33;
break;
}
if (Char == '<') {
State = 35;
break;
}
if (Char == '>') {
State = 39;
break;
}
if (Char == '=') {
State = 43;
break;
}
if (Char == '&') {
State = 45;
break;
}
if (Char == '^') {
State = 48;
break;
}
if (Char == '|') {
State = 50;
break;
}
if (Char == '~') {
return T_TILDA;
}
if (Char == '(') {
return T_OTVN;
}
if (Char == ')') {
return T_ZAVN;
}
if (Char == '{') {
return T_SOTV;
}
if (Char == '}') {
return T_SZAV;
}
if (Char == ';')
return T_STRED;
if (Char == ',')
return T_CARKA;
if (Char == EOF) {
return T_NULL;
}
return T_ERROR;
case 1:
if (isdigit(Char) || isalpha(Char) || Char == '_') {
if (StrPos < (int)sizeof (Lex_Str) - 1 ) {
Lex_Str [StrPos ++] = Char;
}
break;
}
is.unget();
Lex_Str [StrPos] = 0;
for (i=0;i<NrKeywords;i++) {
if (!strcmp(Keywords[i].Keyword, Lex_Str)) {
return Keywords[i].Token;
}
}
return LEX_IDENT;
case 2:
if (isdigit(Char)) {
Lex_Int = 10*Lex_Int + (Char - '0');
Lex_Float = 10.0*Lex_Float + (Char - '0');
break;
}
if (Char == '.') {
DmsPos = 0.1;
State = 3;
break;
}
if (Char == 'e' || Char == 'E' ) {
State = 4;
Exp = 0;
ExpSign = 1;
break;
}
is.unget();
return LEX_INT;
case 3:
if (isdigit(Char)) {
Lex_Float += DmsPos*(Char - '0');
DmsPos /= 10.0;
break;
}
if (Char == 'e' || Char == 'E' ) {
State = 4;
Exp = 0;
ExpSign = 1;
break;
}
is.unget();
return LEX_FLOAT;
case 4:
if (Char == '+' || Char == '-') {
ExpSign = (Char == '-') ? -1:1;
State = 5;
break;
}
if (isdigit(Char)) {
Exp = (Char-'0');
State = 6;
break;
}
return (T_ERROR);
case 5:
if (isdigit(Char)) {
Exp = (Char-'0');
State = 6;
break;
}
return (T_ERROR);
case 6:
if (isdigit(Char)) {
Exp = 10*Exp + (Char - '0');
break;
}
is.unget();
Lex_Float *= pow ( 10.0, Exp * ExpSign );
return (LEX_FLOAT);
case 7:
if (isdigit(Char)) {
Lex_Float = DmsPos * (Char - '0');
DmsPos /= 10.0;
State = 3;
break;
}
return (T_ERROR);
case 8:
if (Char == '.') {
DmsPos = 0.1;
State = 3;
break;
}
if (Char == 'x' || Char == 'X') {
State = 9;
break;
}
if (isodigit(Char)) {
Lex_Int = Char - '0';
State = 14;
break;
}
is.unget();
return(LEX_INT);
case 9:
if (isxdigit(Char)) {
State = 10;
Char = toupper(Char);
Lex_Int = (Char > 'A' ) ? Char - 'A' + 10 : Char - '0';
Lex_Float = (double) Lex_Int;
break;
}
return (T_ERROR);
case 10:
if (isxdigit(Char)) {
Char = toupper(Char);
Lex_Int = (Lex_Int << 4 ) + (Char > 'A' ) ? Char - 'A' + 10 : Char - '0';
Lex_Float = (double) Lex_Int;
break;
}
if (Char == 'p' || Char == 'P') {
State = 11;
Exp=0;
ExpSign = 1;
break;
}
is.unget();
return LEX_INT;
case 11:
if (isxdigit(Char)) {
Char = toupper(Char);
Exp = (Char > 'A' ) ? Char - 'A' + 10 : Char - '0';
State = 13;
break;
}
if (Char == '-' || Char == '+') {
ExpSign = (Char == '-') ? -1 : 1 ;
State = 12;
break;
}
return T_ERROR;
case 12:
if (isxdigit(Char)) {
Char = toupper(Char);
Exp = (Char > 'A' ) ? Char - 'A' + 10 : Char - '0';
State = 13;
break;
}
return T_ERROR;
case 13:
if (isxdigit(Char)) {
Char = toupper(Char);
Exp = (Exp << 4) + (Char > 'A' ) ? Char - 'A' + 10 : Char - '0';
break;
}
is.unget();
Lex_Float = Lex_Float * pow(2.0,ExpSign*Exp);
return LEX_FLOAT;
case 14:
if (isodigit(Char)) {
Lex_Int = (Lex_Int << 3 ) + Char - '0';
break;
}
is.unget();
return LEX_INT;
case 15:
if (Char == '=' ) {
return T_RDIV;
}
if (Char == '/') {
State = 16;
break;
}
if (Char == '*') {
State = 17;
break;
}
is.unget();
return T_DIV;
case 16:
if (Char == '\n') {
State = 0;
break;
}
if (Char == EOF) {
return T_ERROR;
}
break;
case 17:
if (Char == '*') {
State = 18;
break;
}
if (Char == EOF) {
return T_ERROR;
}
break;
case 18:
if (Char == '*') {
break;
}
if (Char == '/') {
State = 0;
break;
}
if (Char == EOF) {
return T_ERROR;
}
State = 17;
break;
case 20:
if (Char == '+')
return T_PLUSPLUS;
if (Char == '=')
return T_RPLUS;
is.unget();
return T_PLUS;
case 23:
if (Char == '-')
return T_MINUSMINUS;
if (Char == '=')
return T_RMINUS;
is.unget();
return T_MINUS;
case 26:
if (Char == '=')
return T_RVYKR;
is.unget();
return T_VYKR;
case 31:
if (Char == '=')
return T_RKRAT;
is.unget();
return T_KRAT;
case 33:
if (Char == '=')
return T_RMOD;
is.unget();
return T_MOD;
case 35:
if (Char == '<') {
State = 36;
break;
}
if (Char == '=')
return T_RLESS;
is.unget();
return T_LESS;
case 36:
if (Char == '=')
return T_RLESSLESS;
is.unget();
return T_LESSLESS;
case 39:
if (Char == '>') {
State = 40;
break;
}
if (Char == '=')
return T_RHIGH;
is.unget();
return T_HIGH;
case 40:
if (Char == '=')
return T_RHIGHHIGH;
is.unget();
return T_HIGHHIGH;
case 43:
if (Char == '=')
return T_RROV;
is.unget();
return T_ROV;
case 45:
if (Char == '&')
return T_AND;
if (Char == '=')
return T_RAMP;
is.unget();
return T_AMP;
case 48:
if (Char == '=')
return T_RXOR;
is.unget();
return T_XOR;
case 50:
if (Char == '=')
return T_RPIPE;
if (Char == '|')
return T_OR;
is.unget();
return T_PIPE;
}
}
}
to je vlastne stavovy automat
tohle je lexikalní analyzator volame porad dokola NextToken(); ten nam vraci typ toho co se načetlo a případně naplní v LexAnal třídě potřebná pole - pokud se načetla int hodnota, je naplněná Lex_Int
v další části si ukážeme syntaktickou analýzu vystavěnou na téhle lexikální