How to account for numbers/digits in file

My goal is to read the query file, which contains docs, and then read the data file, which also contains docs. Next, I need to ask the user which query doc (say, number 2) should be used for querying against the docs in the data file. I can read the query file without problems because there are no numbers inside its docs. The data file, however, has numbers inside the docs, and that is causing heartache. How do I account for numbers appearing inside the docs? For instance, the 44 below in doc 2 is causing me issues.

Sample data file
------------------------------------
1
seven five one
2
three four nine fifteen
twelve 44 nineteen
3
two one
eleven
-----------------------------------

Explanation of data file: doc 1 is "seven five one",
doc 2 is "three four nine fifteen twelve 44 nineteen" (the 44 is part of the doc's text, not a doc number),
doc 3 is "two one eleven".


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <Windows.h>
#include <math.h>
#include "pch.h"
#include <cmath>
#include <cassert>
#include <fstream>
#include <sstream>
#include <stdio.h>
#include <string>
#include <algorithm>
#include <vector>
#include <cctype>
#include <iomanip>
#include <iostream>       // std::cout
#include <string>         // std::string
#include <cstddef>        // std::size_t

using namespace std;


const int MAX_STOPWORDS = 200;
const int MAX_TOKENS = 400;
const char* DEFAULT_STOPWORDS_FILENAME = "stopwords.txt";
const char* CHARS_TO_REMOVE = "12.,:;\"()";


// Collects the words of each query document, grouped by document number.
//
// Tokens are fed one at a time through addItem(): a token whose first character
// is a decimal digit selects the current document (row); any other token is
// appended to the current document. Streaming the container prints every
// non-empty document as "<number> <word> <word> ...".
//
// NOTE(review): the digit test is made per token, so a number that is part of a
// document's text (e.g. "44") is also taken as a new document number. Telling
// the two apart needs line-level context that this interface does not receive.
class ContainerQ
{
	std::vector< std::vector<std::string> > values;   // values[n] holds the words of document n
	int row = 0;                                      // document currently being filled

	// Grows `values` so that values[index] is a valid element.
	void ensureRow(int index)
	{
		const std::size_t needed = static_cast<std::size_t>(index) + 1;
		if (values.size() < needed)
			values.resize(needed);
	}

	// Makes document `r` the current one and traces the state to std::cout.
	// `query_nums` is only compared against the new row; it is never modified.
	void setRow(int r, int &query_nums)
	{
		row = r;
		std::cout << "r: " << r << " ";
		std::cout << "values.size(): " << values.size() << " ";
		ensureRow(r);
		std::cout << "setRow: " << query_nums << " ";
		std::cout << "original row value: " << row << " ";
		std::cout << "get_query: " << query_nums << " ";
		// Comparison, not assignment: the earlier `query_nums = row` overwrote
		// query_nums and was true for every row other than 0.
		if (query_nums == row) {
			std::cout << "query_nums = row: " << query_nums << " " << row << std::endl;
		}
	}

	// Appends `str` to the current document and traces the insertion.
	void addString(const std::string &str)
	{
		// A word may arrive before any document number has been seen; make sure
		// the default row exists instead of indexing an empty vector.
		ensureRow(row);
		values[row].push_back(str);
		std::cout << "values[row].push_back(str): " << row << "    " << str << "    " << std::endl;
	}

public:
	// Feeds one token into the container.
	//   str        - the token; a leading digit makes it a document number
	//                (parsed with std::stoi, which throws std::out_of_range if
	//                the value does not fit in an int)
	//   query_nums - the query number the caller is interested in; used only
	//                for the trace output when a document number is read
	void addItem(std::string str, int query_nums)
	{
		// std::isdigit requires a value representable as unsigned char.
		if (std::isdigit(static_cast<unsigned char>(str[0])))
			setRow(std::stoi(str), query_nums);
		else
			addString(str);
	}

	// Writes every non-empty document to `out`, one per line, as
	// "<number> <word> <word> ... \n". A size trace also goes to std::cout.
	friend std::ostream & operator << (std::ostream &out, const ContainerQ &Q)
	{
		for (std::size_t i = 0; i < Q.values.size(); ++i)
		{
			const std::vector<std::string> &words = Q.values[i];
			if (words.empty())
				continue;

			out << i << " ";
			for (const std::string &s : words) out << s << " ";
			out << '\n';
			std::cout << "Q.values[i]: " << words.size() << "    ";
		}

		return out;
	}
};

// Collects the words of each data document, grouped by document number.
//
// Tokens are fed one at a time through addItemD(): a token whose first
// character is a decimal digit selects the current document (row); any other
// token is appended to the current document. Streaming the container prints
// every non-empty document as "<number> <word> <word> ...".
//
// NOTE(review): the digit test is made per token, so a number that is part of a
// document's text (the "44" in the sample data) is also taken as a new document
// number. Telling the two apart needs line-level context that this interface
// does not receive.
class ContainerD
{
	std::vector< std::vector<std::string> > values;   // values[n] holds the words of document n
	int row = 0;                                      // document currently being filled

	// Grows `values` so that values[index] is a valid element.
	void ensureRowD(int index)
	{
		const std::size_t needed = static_cast<std::size_t>(index) + 1;
		if (values.size() < needed)
			values.resize(needed);
	}

	// Makes document `r` the current one, creating it if necessary.
	void setRowD(int r)
	{
		row = r;
		ensureRowD(r);
	}

	// Appends `str` to the current document and traces the insertion.
	void addStringD(const std::string &str)
	{
		// A word may arrive before any document number has been seen; make sure
		// the default row exists instead of indexing an empty vector.
		ensureRowD(row);
		values[row].push_back(str);
		std::cout << "values[row].push_back(str): " << row << "    " << str << "    " << std::endl;
	}

public:
	// Feeds one token into the container.
	//   str - the token; a leading digit makes it a document number (parsed
	//         with std::stoi, which throws std::out_of_range if the value does
	//         not fit in an int)
	void addItemD(std::string str)
	{
		// std::isdigit requires a value representable as unsigned char.
		if (std::isdigit(static_cast<unsigned char>(str[0])))
			setRowD(std::stoi(str));
		else
			addStringD(str);
	}

	// Writes every non-empty document to `out`, one per line, as
	// "<number> <word> <word> ... \n". A size trace also goes to std::cout.
	friend std::ostream & operator << (std::ostream &out, const ContainerD &D)
	{
		for (std::size_t i = 0; i < D.values.size(); ++i)
		{
			const std::vector<std::string> &words = D.values[i];
			if (words.empty())
				continue;

			out << i << " ";
			for (const std::string &s : words) out << s << " ";
			out << '\n';
			std::cout << "D.values[i]: " << words.size() << "    ";
		}

		return out;
	}
};
Something like this, perhaps (uses the standard regex library):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#include <iostream>
#include <string>
#include <regex>
#include <map>
#include <vector>
#include <sstream>
#include <iomanip>

// a line is a query number when it holds nothing but one integer of one to five
// decimal digits, optionally surrounded by white space
bool is_query_number( const std::string& line )
{
    // anchored pattern: leading white space, 1-5 digits, trailing white space
    static const std::regex query_number_pattern( "^\\s*\\d{1,5}\\s*$" ) ;

    const bool whole_line_is_number =
        std::regex_match( line.begin(), line.end(), query_number_pattern ) ;

    return whole_line_is_number ;
}

// return a vector of the alphanumeric tokens in the line, in order of appearance
// (a token is a maximal run of word characters: letters, digits, underscore);
// an empty or all-separator line yields an empty vector
std::vector<std::string> get_tokens( const std::string& line )
{
    // one or more word characters. note: "\\w++" is not valid in the default
    // ECMAScript grammar (nothing for the second '+' to repeat), and constructing
    // a std::regex from it throws std::regex_error at run time
    static const std::regex words_re( "\\w+" ) ;

    std::vector<std::string> result ;

    // walk over every non-overlapping match of the pattern in the line
    std::sregex_iterator iter( line.begin(), line.end(), words_re ), end ;
    for( ; iter != end ; ++iter ) result.push_back( iter->str() ) ;

    return result ;
}

std::map< int, std::vector<std::string> > get_queries( std::istream& stm )
{
    std::map< int, std::vector<std::string> > result ;

    int query_number = 0 ; // default initial query number is zero

    std::string line ;
    while( std::getline( stm, line ) ) // for each line in the stream
    {
        // if this line contains a new query number, update query_number
        if( is_query_number(line) ) query_number = std::stoi(line) ;

        // otherwise add the tokens in the line to the current query
        else for( const auto& tok : get_tokens(line) ) result[query_number].push_back(tok) ;
    }

    return result ;
}

int main()
{
   // create some data for testing
   std::istringstream file( "1\n"
                            "seven five one\n"
                            " 2 \n"
                            " three four  nine fifteen \n"
                            "twelve 44 nineteen\n"
                            "3\n"
                            "two one\n"
                            "eleven\n"
                          ) ;

    for( const auto& [ query_number, tokens ] : get_queries(file) )
    {
        std::cout << "query #" << query_number << "  [ " ;
        for( const auto& tok : tokens ) std::cout << std::quoted(tok) << ' ' ;
        std::cout << "]\n\n" ;
    }
}

http://coliru.stacked-crooked.com/a/49e2df0e352856dc
Trying that code, I'm getting a few errors, such as the identifiers tokens and query_number being undefined.
trying that code...getting a few errors.

The code requires using C++17 or higher.
Modify main() to:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
int main()
{
   // create some data for testing
   std::istringstream file( "1\n"
                            "seven five one\n"
                            " 2 \n"
                            " three four  nine fifteen \n"
                            "twelve 44 nineteen\n"
                            "3\n"
                            "two one\n"
                            "eleven\n"
                          ) ;

    // for( const auto& [ query_number, tokens ] : get_queries(file) )
    for( const auto& pair : get_queries(file) ) // ***
    {
        const auto& query_number = pair.first ; // *** aded
        const auto& tokens = pair.second ;// *** added
        
        std::cout << "query #" << query_number << "  [ " ;
        
        //for( const auto& tok : tokens ) std::cout << std::quoted(tok) << ' ' ;
        for( const auto& tok : tokens ) std::cout << '"' << tok << "\" " ; // ***
        std::cout << "]\n\n" ;
    }
}

and it should work with C++11
(under project > Properties > C/C++ > Language > C++ Language Standard)...these are the only choices in visual studio community 2017:

ISO C++14 Standard (/std:c++14)
ISO C++17 Standard (/std:c++17)
The latest draft standard (/std:c++latest)

I selected: latest draft standard (/std:c++latest) since that seems to incorporate all the latest.

However, when I run the code with the updated main(), I get an unhandled exception from the xthrow.cpp file (which of course is not the file name I'm using), so I'm not sure where to go from here. Any advice is definitely appreciated.
There is a typo on line number 22:

1
2
3
4
5
6
7
8
9
10
11
12
13
// collect the alphanumeric tokens of the line, in the order they appear
std::vector<std::string> get_tokens( const std::string& line )
{
    // corrected pattern: one or more word characters
    // (the earlier listing had the typo "\\w++" on its line 22)
    static const std::regex words_re( "\\w+" ) ;

    std::vector<std::string> tokens ;

    const std::sregex_iterator no_more_matches ;
    std::sregex_iterator match( line.begin(), line.end(), words_re ) ;

    while( match != no_more_matches )
    {
        tokens.push_back( match->str() ) ;
        ++match ;
    }

    return tokens ;
}


Correct it and try to run it again.
With the online Microsoft compiler: https://rextester.com/RTWMI62553
Thanks for the fix to the regex expression, JLBorges, your example now works for VS2017 and 2019.
Yes, that worked. Thanks a bunch.
Topic archived. No new replies allowed.