rewrote lexer, still getting core dumped

closed account (Dy7SLyTq)
so i rewrote my lexer, which i needed to do anyways because the last one was written horribly and wasnt grabbing full names, but anyways, this rewrite is getting core dumped. whats wrong?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <regex.h>

using std::ostream;
using std::cout;
using std::endl;
using std::ifstream;
using std::string;
using std::vector;

class Token
{
     string Type, Name;
     int LineNo, LineCol;

     public:
          Token()
               : Type("UNINITIALIZED"), Name(""), LineNo(0), LineCol(-1) {}

          Token(string type, string name, int lineno, int linecol)
               : Type(type), Name(name), LineNo(lineno), LineCol(linecol) {}

          void SetType   (string type) { Type    = type;    }
          void SetName   (string name) { Name    = name;    }
          void SetLineNo (int lineno ) { LineNo  = lineno;  }
          void SetLineCol(int linecol) { LineCol = linecol; }

          string GetType   () { return Type;    }
          string GetName   () { return Name;    }
          int    GetLineNo () { return LineNo;  }
          int    GetLineCol() { return LineCol; }
};

ostream& operator<<(ostream &out, Token &token)
{
     out<<"("<< token.GetType() <<", "<< token.GetName() <<", "<< token.GetLineNo() <<", "<< token.GetLineCol() <<")";
     return out;
}

void ReadInFile(vector<string>&, ifstream&);
void Lex(string&, vector<Token>&, bool);

int main(int argc, char *argv[])
{
     ifstream File(argv[1]);
     vector<string> FileContents;

     ReadInFile(FileContents, File);

     vector<Token> TokenList;

     for(auto &Counter : FileContents)
          Lex(Counter, TokenList, true);

     for(auto &Counter : TokenList)
          cout<< Counter << endl;
}

void ReadInFile(vector<string> &FileContents, ifstream &File)
{
     string Line;

     while(getline(File, Line))
          FileContents.push_back(Line);
}

void Lex(string &Line, vector<Token> &TokenList, bool NewLine)
{
     static int LineNo = 1;
     int LineSize = Line.size();
     regex_t Regex;
     regmatch_t Match;

     regcomp(&Regex, "\"[^\"]+\"", REG_EXTENDED);
     if(regexec(&Regex, Line.c_str(), 1, &Match, 0) == 0)
          TokenList.push_back(Token("STRING", Line.substr(Match.rm_so, Match.rm_eo - Match.rm_so), LineNo, Match.rm_so));
     regfree(&Regex);

     regcomp(&Regex, "function", REG_EXTENDED);
     if(regexec(&Regex, Line.c_str(), 1, &Match, 0) == 0)
          TokenList.push_back(Token("KEYWORD", Line.substr(Match.rm_so, Match.rm_eo - Match.rm_so), LineNo, Match.rm_so));
     regfree(&Regex);

     regcomp(&Regex, "[_|A-Z|a-z]?[_|A-Z|a-z|0-9]+", REG_EXTENDED);
     if(regexec(&Regex, Line.c_str(), 1, &Match, 0) == 0)
          TokenList.push_back(Token("NAME", Line.substr(Match.rm_so, Match.rm_eo - Match.rm_so), LineNo, Match.rm_so));
     regfree(&Regex);

     regcomp(&Regex, "[0-9]+.?[0-9]+", REG_EXTENDED);
     if(regexec(&Regex, Line.c_str(), 1, &Match, 0) == 0)
          TokenList.push_back(Token("NUMBER", Line.substr(Match.rm_so, Match.rm_eo - Match.rm_so), LineNo, Match.rm_so));
     regfree(&Regex);

     if(Match.rm_eo != Line.size() - 1)
     {
          Line = Line.substr(Match.rm_eo, Line.size() - 1 - Match.rm_eo);
          Lex(Line, TokenList, false);
     }

     if(NewLine)
          LineNo++;
}
closed account (S6k9GNh0)
Have you stepped through it? Which line is causing the segfault/crash?
closed account (Dy7SLyTq)
i cant tell. looking over my source code i think its because the first line im throwing at it (import sysio;) isnt getting caught by the re's so its a problem at line 97 then. i dont have the ability to download a debugger right now and logging has failed me.
Just eyeballing, but aren't you losing a byte every time this line executes:

Line = Line.substr(Match.rm_eo, Line.size() - 1 - Match.rm_eo);

Maybe try it without the "-1".
closed account (Dy7SLyTq)
nope :L didnt work
Adding a little bit of output

1
2
3
4
void Lex(string &Line, vector<Token> &TokenList, bool NewLine)
{
    cout << "Lex : >>>" << Line << "<<<" << endl;
    cout << endl;	


and

1
2
3
4
5
6
7
8
9
     if(Match.rm_eo != Line.size() - 1)
     {
     	  cout << "Match.rm_eo = " << Match.rm_eo << endl;
          cout << "Line.size() - 1 = " << (Line.size() - 1) << endl;
          cout << "Line.size() - 1 - Match.rm_eo = " << (Line.size() - 1 - Match.rm_eo) << endl;
          cout << endl;
          Line = Line.substr(Match.rm_eo, Line.size() - 1 - Match.rm_eo);
          Lex(Line, TokenList, false);
     }


And then feeding your code with a file containing

"Hello, world!"


I get

Lex : >>>"Hello, world!"<<<

Match.rm_eo = 6
Line.size() - 1 = 12
Line.size() - 1 - Match.rm_eo = 6

Lex : >>> world!<<<

Match.rm_eo = 6
Line.size() - 1 = 5
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 2358520
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4292608775

terminate called after throwing an instance of 'std::out_of_range'
  what():  basic_string::substr

This application has requested the Runtime to terminate it in an unusual way.
Please contact the application's support team for more information.


The final value of Match.rm_eo looks suspiciously like uninit memory, so I inititlize Match (and Regex while I'm at it) to zero

1
2
     regex_t Regex = {0};
     regmatch_t Match = {0};


After which I get

Lex : >>>"Hello, world!"<<<

Match.rm_eo = 6
Line.size() - 1 = 14
Line.size() - 1 - Match.rm_eo = 8

Lex : >>>, world!<<<

(STRING, "Hello, world!", 1, 0)
(NAME, Hello, 1, 1)
(NAME, world, 1, 2)


But when I change the file to

3.1415927


the program goes into an infinite loop

Lex : >>>3.1415927<<<

Match.rm_eo = 9
Line.size() - 1 = 8
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 0
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 0
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 0
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4294967295

...


which carries on till the program blows up due to lack of stack space.

And

function add(var base)


also hits an infinite loop

Lex : >>>function add(var base)<<<

Match.rm_eo = 8
Line.size() - 1 = 21
Line.size() - 1 - Match.rm_eo = 13

Lex : >>> add(var base<<<

Match.rm_eo = 4
Line.size() - 1 = 12
Line.size() - 1 - Match.rm_eo = 8

Lex : >>>(var bas<<<

Match.rm_eo = 4
Line.size() - 1 = 7
Line.size() - 1 - Match.rm_eo = 3

Lex : >>> ba<<<

Match.rm_eo = 3
Line.size() - 1 = 2
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 0
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4294967295

Lex : >>><<<

Match.rm_eo = 0
Line.size() - 1 = 4294967295
Line.size() - 1 - Match.rm_eo = 4294967295

...


I think you need to look at your temination condition and with dealing with the situation where multiple matched are possible (or do you want "Hello, world!" to generate 3 tokens rather just the one, "STRING" token?)

And the - 1 does look suspect.

Andy
Last edited on
closed account (Dy7SLyTq)
i would like "Hello, world!" to just be one string token. i took out the - 1 but so far it hasnt made a difference. what would you suggest i do to fix this?
When you say it hasn't made a difference, do you mean it still crashes? Have you initialized the variables I pointed out?

To get your code to see "Hello, world!" as just a string, you could keep track of where the matches for the different patterns were found: the earliest and longest being the winner (the only token added to TokenList.)

As it stands, the code does still eventually crash with "Hello, world!", when the call stack gets deep enough, because the final !" is never handled. The first call to Lex matches first "Hello, world!" and then Hello, with the latter match being used to work out how to call Lex the second time: with , world!". This matches world and then calls Lex with just !". As this doesn't match anything, it calls Lex with !" again and again and ...

So I think you also need to handle the case when nothing is matched better.

Not sure this is the most elegant route, but it should work with your existing regex strings. If you do go this route, it might make sense to make your Lex function table driven.

Andy
Last edited on
closed account (Dy7SLyTq)
i meant that when the - 1 was suggested to be taken out i did and it still crashed. i realized what my problem was though.i do need to initialize my variables, but what i need to do is have it lex immedaitly after finding a match and pushing back the token
Topic archived. No new replies allowed.