changing a classification code

Hello, I found this code on the internet (https://programmersought.com/article/52001300562/). I managed to correct the errors that prevented it from compiling. Now I would like to load the training data from a file and classify a value entered in the program, but I can't figure out how the program separates the columns of the training data. Can you help me?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <bits/stdc++.h>
using namespace std;

/*test sample data 
1.5 1
1.92 3
1.7 2
1.73 2
1.6 1
1.75 2
1.5 1
1.6 1
2.05 3
1.9 3
1.68 2
1.78 2
1.70 2
1.68 2
1.65 2
 Test pending data
1.5
 Result
1 
*/

// One sample: its feature coordinates plus an integer class label.
struct Node
{
	vector<double> v;	// feature values, one entry per dimension
	int re;			// class label attached to this sample
};

// Fixed-capacity global table of samples (10005 slots, zero-initialized as a global).
Node board[10005];

// Strict ordering on (distance, label) pairs that looks at the distance only.
// Used as the priority_queue comparator, it puts the pair with the LARGEST
// distance at top().
struct cmp
{
	bool operator()(const pair<double, int> lhs, const pair<double, int> rhs)
	{
		return rhs.first > lhs.first;
	}
};

// Global heap of (distance, label) candidates; top() is the farthest one.
priority_queue<pair<double, int>, std::vector< pair<double, int> >, cmp> Q;
double getLen(int dim, int num1, int num2)
{
	// Euclidean distance between board[num1] and board[num2], measured over
	// the first `dim` coordinates. Both vectors must hold at least `dim`
	// values; no bounds check is performed.
	const vector<double> &lhs = board[num1].v;
	const vector<double> &rhs = board[num2].v;

	double sumOfSquares = 0;
	int axis = 0;
	while (axis < dim)
	{
		sumOfSquares += pow(lhs[axis] - rhs[axis], 2);
		++axis;
	}
	return sqrt(sumOfSquares);
}

int getRe()
{
	// Majority vote over the labels currently stored in Q.
	// Side effect: Q is emptied.
	// Returns the label with the most votes; on a tie the smallest label wins
	// (the map is walked in ascending key order and only a strictly larger
	// count replaces the current best). Returns -1 when Q was empty.
	map<int, int> book;	// label -> number of votes
	while (!Q.empty())
	{
		// operator[] value-initializes a missing count to 0 before the increment.
		++book[Q.top().second];
		Q.pop();
	}

	// Fix: the result used to be left uninitialized, so an empty Q made the
	// function return an indeterminate value.
	int num = 0, re = -1;
	for (map<int, int>::iterator it = book.begin(); it != book.end(); ++it)
	{
		if (it->second > num)
		{
			num = it->second;
			re = it->first;
		}
	}
	return re;
}

int main()
{
	// Interactive k-nearest-neighbour classifier driven by standard input:
	//   1. read the number of feature dimensions and K,
	//   2. read training samples as a flat stream of numbers (dim features
	//      followed by one label per sample) until the sentinel -1,
	//   3. repeatedly read a query of dim values and print the majority label
	//      of its K nearest training samples, until -1 or end of input.
	// Note: because -1 is the sentinel, a feature or label equal to -1 cannot
	// be entered.
	const int capacity = static_cast<int>(sizeof(board) / sizeof(board[0]));

	int dim;	//dimensions
	int K;
	cout << "Please enter the dimensions of the sample data:" << endl;
	cin >> dim;
	cout << "Please enter a K value:" << endl;
	cin >> K;
	// Fix: with K < 1 the heap stays empty and Q.top() below is undefined;
	// with dim < 1 no sample can ever be completed.
	if (!cin || dim < 1 || K < 1)
	{
		cerr << "The dimension and K must both be positive integers." << endl;
		return 1;
	}

	cout << "Please enter sample data (input -1 end data input):" << endl;
	int top = 0, num = 0;
	double t;
	while (cin >> t && t != -1)
	{
		// Fix: board[top] is also needed for the query, so the last slot is
		// kept free instead of writing past the end of the array.
		if (top >= capacity - 1)
		{
			cerr << "Too many samples (maximum " << capacity - 1 << ")." << endl;
			return 1;
		}
		if (num == dim)
		{
			// The value following dim features is the sample's label.
			board[top].re = static_cast<int>(t);
			num = 0;
			++top;
		}
		else
		{
			board[top].v.push_back(t);
			++num;
		}
	}
	if (top == 0)
	{
		cerr << "No complete sample was entered." << endl;
		return 1;
	}

	while (true)
	{
		cout << "Please enter the pending data (input -1 end program):" << endl;
		while (!Q.empty()) Q.pop();
		// Slot `top` holds the query; this also discards a trailing
		// incomplete training sample left there by the loop above.
		vector<double> ().swap(board[top].v);	//Clear the vector and minimize its capacity
		for (int i = 0; i < dim; ++i)
		{
			double t;
			// Fix: a failed read (EOF or non-numeric input) now ends the
			// program; previously it was ignored and the loop spun forever.
			if (!(cin >> t) || t == -1) return 0;	//End the program
			board[top].v.push_back(t);
		}
		// Seed the heap with the first K samples (or all of them if fewer).
		for (int i = 0; i < K && i < top; ++i)
		{
			Q.push(make_pair(getLen(dim, i, top), board[i].re));
		}
		if (top <= K) cout << "The result is: " << getRe() << endl;
		else
		{
			// Q.top() is the farthest kept neighbour: replace it whenever a
			// closer sample is found, so Q ends with the K nearest.
			for (int i = K; i < top; ++i)
			{
				double t = getLen(dim, i, top);
				if (t < Q.top().first)
				{
					Q.pop();
					Q.push(make_pair(t, board[i].re));
				}
			}
			cout << "The result is:" << getRe() << endl;
		}
	}

	return 0;
}
Last edited on
I want to read the file where the data is separated by a comma and use a function such as this one:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// Splits theString at every occurrence of theDelimiter and APPENDS the pieces
// to theStringVector (existing elements are kept).
//   - Adjacent delimiters, or a delimiter at either end, produce empty pieces.
//   - An empty theString produces a single empty piece.
//   - An empty theDelimiter produces a single piece: the whole string.
// Fixes: the declaration had no return type, and an empty delimiter made the
// original loop forever (find("") always matches at `start`, so `start`
// never advanced).
void split( vector<string> & theStringVector,  /* Altered/returned value */
            const  string  & theString,
            const  string  & theDelimiter)
{
    if ( theDelimiter.empty())
    {
        theStringVector.push_back( theString);
        return;
    }

    size_t  start = 0;
    while ( true)
    {
        const size_t  end = theString.find( theDelimiter, start);
        if ( end == string::npos)
        {
            // No further delimiter: the remainder is the last piece.
            theStringVector.push_back( theString.substr( start));
            return;
        }

        theStringVector.push_back( theString.substr( start, end - start));
        // Continue just past the delimiter; this is at most theString.size().
        start = end + theDelimiter.size();
    }
}


But I don't understand the entire data processing of the program (classifier) to be able to modify it.
Last edited on
Well did you read all the links in that link you posted, or did you just grab the code?

Oh, great master.
With the amount of code that doesn't work on the internet, it's rare to be able to simply copy and paste. Finally, I don't claim to have written the code and I quote my source. The problem is that I don't know all the functions (priority_queue) I can use to modify it correctly. So I ask for advice. Maybe you need some advice to search on the internet ? And if it's just to make yourself look better by humiliating the first person you meet, I hope your little intervention has relieved you. But that doesn't make you a better person...
Last edited on
Try the code the way it was originally written:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118


    #include <bits/stdc++.h>
     
    using namespace std;
     
    /* test sample data 
    1.5 1
    1.92 3
    1.7 2
    1.73 2
    1.6 1
    1.75 2
    1.5 1
    1.6 1
    2.05 3
    1.9 3
    1.68 2
    1.78 2
    1.70 2
    1.68 2
    1.65 2
     Test pending data
    1.5
     Result
    1 
    */
     
    // One sample: feature values in v and an integer class label in re.
    struct Node{
    	vector<double> v;	// feature coordinates of the sample
    	int re;	// class label of the sample
    }board[10005];	// global array of 10005 Node slots
     
    // Comparator for (distance, label) pairs: orders by the distance (first) only.
    struct cmp{
    	bool operator()(const pair<double,int> a,const pair<double,int> b){
    		return a.first < b.first;
    	}
    };
     

     // NOTE: `Priority_queue` is capitalized in this listing; the standard
     // container is spelled `priority_queue`, so this line does not compile as written.
     Priority_queue<pair<double,int>,vector<pair<double,int> >,cmp> Q;//Maintain K nearest neighbors
     
     // Euclidean distance between board[num1] and board[num2] over the first dim coordinates.
     // NOTE: the return type is written `Double`; C++ spells it `double`, so this
     // line does not compile as written.
     Double getLen(int dim,int num1,int num2){//Get Euclidean distance 
    	double re = 0;	// running sum of squared coordinate differences
    	for(int i=0 ; i<dim ; ++i)
    		re += pow(board[num1].v[i]-board[num2].v[i],2);
    	return sqrt(re);
    }
     
     // Majority vote over the labels stored in Q; Q is emptied in the process.
     // NOTE: the return type is written `Int`; C++ spells it `int`, so this line
     // does not compile as written.
     Int getRe(){//Based on the data in Q to get the result 
    	map<int,int> book;	// label -> number of occurrences
    	map<int,int>::iterator it;
    	while(!Q.empty()){
    		int t = Q.top().second;
    		Q.pop();
    		if(book.find(t) != book.end())++book[t];
    		else book[t] = 1;
    	}
    	// NOTE(review): re is not initialized; if book is empty it is returned without ever being assigned.
    	int num = 0,re;
    	for(it=book.begin() ; it!=book.end() ; ++it){
    		// Strict '>' keeps the first (smallest) label when counts tie.
    		if(it->second > num){
    			num = it->second;
    			re = it->first;
    		}
    	}
    	return re;
    }
     
    // Reads dim and K, then training samples (dim features followed by a label)
    // until -1, then classifies queries of dim values until -1 is entered.
    // NOTE: several identifiers in this listing are capitalized (`Int`, `Cout`,
    // `Vector`, `If`); C++ is case-sensitive, so those lines do not compile as written.
    int main(){
    	
    	 Int dim;//dimensions
    	int K; 
    	 Cout << "Please enter the dimensions of the sample data:" << endl;
    	cin >> dim;
    	 Cout << "Please enter a K value:" << endl;
    	cin >> K; 
    	 Cout << "Please enter sample data (input -1 end data input):" << endl;
    	int top = 0,num = 0;	// top: completed samples; num: features read for the current sample
    	double t;
    	while(cin >> t && t != -1){
    		if(num == dim){
    			// After dim features, the next value is the sample's label.
    			board[top].re = (int)t;
    			num = 0;
    			++top;
    		}
    		else {
    			board[top].v.push_back(t);
    			++num;
    		}
    	}
    	while(true){
    		 Cout << "Please enter the pending data (input -1 end program):" << endl;
    		while(!Q.empty())Q.pop();
    		// board[top] is reused as the slot holding the query.
    		 Vector<double>().swap(board[top].v);//Clear the vector and minimize its capacity 
    		for(int i=0 ; i<dim ; ++i){
    			double t;
    			// NOTE(review): the result of this read is not checked.
    			cin >> t;
    			 If(t == -1)return 0;//End the program 
    			board[top].v.push_back(t);
    		}
    		// Seed Q with the first K samples (or all of them when there are fewer).
    		for(int i=0 ; i<K && i<top; ++i){
    			Q.push(make_pair(getLen(dim,i,top),board[i].re));
    		}
    		 If(top <= K)cout << "The result is: " << getRe() << endl;
    		else {
    			// Replace the farthest kept candidate whenever a closer sample is found.
    			for(int i=K ; i<top ; ++i){
    				double t = getLen(dim,i,top);
    				if(t < Q.top().first){
    					Q.pop();
    					Q.push(make_pair(t,board[i].re));
    				} 
    			}
    			 Cout << "The result is:" << getRe() << endl;
    		} 
    	}
    	
    	return 0;
    } 


It doesn't work !

humility is but truth, pride is but a lie.
Last edited on
I've found another classification code that I can't modify for the use I want, can you help me oh, great master:
source : https://medium.com/@dr.sunhongyu/machine-learning-c-naive-bayes-classifier-example-dbe7b88a999b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#include <iostream>
#include <string>
#include <algorithm>
#include <bits/stdc++.h>
#include <stdlib.h>
#include <sstream>
#include <ctime>
#include <vector>
using namespace std;
// Naive Bayes classifier over integer-coded categorical attributes.
// Training happens in the constructor; predict() then scores each class as
// P(C) * P(x1|C) * P(x2|C) * ... and returns the best one.
// Attribute ids share one table per class, so ids of different attribute
// columns must not collide.
class NaiveBayesClassifer
{
	private:
		//<class id, class probability> <C, P(C)>
		unordered_map<int, double> classes;
	//<class id, <attribute id, probability>> <C, <x, P(x|C)>>
	unordered_map<int, unordered_map<int, double>> attributesPerClass;
	public:
		// data:    one row per sample, laid out as { class id, attribute id 1, ..., attribute id DimSize }.
		// DimSize: number of attribute ids that follow the class id in each row.
		// Rows with fewer than DimSize + 1 entries are skipped (they used to be
		// indexed out of range). The trained probabilities are printed to cout.
		NaiveBayesClassifer(vector<vector < int>> &data, int DimSize)
		{
			const size_t attributeCount = DimSize > 0 ? static_cast<size_t>(DimSize) : 0;
			size_t usedRows = 0;

			// start training
			// count all classes and attributes
			for (const auto &entry: data)
			{
				if (entry.size() < attributeCount + 1) continue;
				++usedRows;

				// operator[] value-initializes missing counters to 0, and also
				// creates the (possibly empty) attribute table for a new class.
				classes[entry[0]] += 1;
				auto &attributeCounts = attributesPerClass[entry[0]];
				for (size_t k = 1; k <= attributeCount; k++)
				{
					attributeCounts[entry[k]] += 1;
				}
			}

			// calculate probability per class and per attribute
			// Fix: iterate by reference. The original iterated over copies, so
			// the normalised P(x|C) values were printed but never stored, and
			// predict() multiplied raw counts instead of probabilities.
			for (auto &seg: attributesPerClass)
			{
				cout << " --- Class " << seg.first << " --- " << endl;
				const double classCount = classes[seg.first];
				for (auto &entry: seg.second)
				{
					entry.second /= classCount;
					cout << "Attribute P(x=" << entry.first << "| C=" << seg.first << ") = " << entry.second << endl;
				}

				classes[seg.first] = classCount / usedRows;
				cout << "Class P(C=" << seg.first << ") = " << classes[seg.first] << endl;
			}
		}

	// predict class with attributes vector < attribute id>
	// Returns the class id with the highest score, or -1 when every class
	// scores 0 (for example when an attribute was never seen with any class).
	// The chosen class and its score are also printed to cout.
	int predict(vector<int> attributes)
	{
		int maxcid = -1;
		double maxp = 0;
		for (const auto &cls: classes)
		{
			// p(C|x) = p(C)*p(x1|C)*p(x2|C)*…
			double pCx = cls.second;
			const auto classIt = attributesPerClass.find(cls.first);
			for (size_t i = 0; i < attributes.size(); i++)
			{
				// Look up without operator[] so that predicting does not
				// insert entries into the model; unseen attributes count as 0.
				double pxC = 0;
				if (classIt != attributesPerClass.end())
				{
					const auto attrIt = classIt->second.find(attributes[i]);
					if (attrIt != classIt->second.end()) pxC = attrIt->second;
				}
				pCx *= pxC;
			}

			if (pCx > maxp)
			{
				maxp = pCx;
				maxcid = cls.first;
			}
		}

		cout << "Predict Class: " << maxcid << " P(C|x) = " << maxp << endl;
		return maxcid;
	}
};
// Appends K identical training rows { class id, attribute id of a1, attribute id of a2 }
// to `data`. Names are translated through classmap / attrimap with operator[],
// so a name missing from a map is inserted there with id 0.
void populateData(vector<vector < int>> &data, unordered_map< string, int> &classmap, unordered_map< string, int> &attrimap,
	string c, string a1, string a2, int K)
{
	vector<int> row;
	row.push_back(classmap[c]);
	row.push_back(attrimap[a1]);
	row.push_back(attrimap[a2]);

	// Fill-insert: K copies of the same row at the end of the data set.
	data.insert(data.end(), K, row);
}

int main()
{
	// Demo: builds a small fruit data set (2 attributes, 3 classes), trains
	// the classifier on it and predicts the class of a red, heart-shaped fruit.

	// prepare a training dataset with 2 attributes and 3 classes
	unordered_map<string, int> classmap = {
		{ "apple", 0 },
		{ "pineapple", 1 },
		{ "cherry", 2 }
	};
	// Colour and shape ids live in one table, hence the disjoint id ranges.
	unordered_map<string, int> attrimap = {
		// color
		{ "red", 0 },
		{ "green", 1 },
		{ "yellow", 2 },
		// shape
		{ "round", 10 },
		{ "oval", 11 },
		{ "heart", 12 }
	};
	vector<vector < int>> data;
	// The last argument is how many identical samples are added.
	populateData(data, classmap, attrimap, "apple", "green", "round", 20);
	populateData(data, classmap, attrimap, "apple", "red", "round", 50);
	populateData(data, classmap, attrimap, "apple", "yellow", "round", 10);
	populateData(data, classmap, attrimap, "apple", "red", "oval", 5);
	populateData(data, classmap, attrimap, "apple", "red", "heart", 5);
	populateData(data, classmap, attrimap, "pineapple", "green", "oval", 30);
	populateData(data, classmap, attrimap, "pineapple", "yellow", "oval", 70);
	populateData(data, classmap, attrimap, "pineapple", "green", "round", 5);
	populateData(data, classmap, attrimap, "pineapple", "yellow", "round", 5);
	populateData(data, classmap, attrimap, "cherry", "yellow", "heart", 50);
	populateData(data, classmap, attrimap, "cherry", "red", "heart", 70);
	populateData(data, classmap, attrimap, "cherry", "yellow", "round", 5);
	// Fix: random_shuffle was deprecated in C++14 and removed in C++17;
	// shuffle with an explicit engine replaces it. The engine is
	// default-seeded, so the order is the same on every run.
	mt19937 rng;
	shuffle(data.begin(), data.end(), rng);
	// train model
	NaiveBayesClassifer mymodel(data, 2);
	// predict with model
	int cls = mymodel.predict({ attrimap["red"], attrimap["heart"] });
	cout << "Predicted class " << cls << endl;
	return 0;
}
I'll manage on my own. Thanks anyway.
So here it is: the training data must be placed in the file training.txt. The program reads the data from that file and uses it for training.

training data as in the example of the source site (https://programmersought.com/article/52001300562/)
to be placed in training.txt :
1.5,1
1.92,3
1.7,2
1.73,2
1.6,1
1.75,2
1.5,1
1.6,1
2.05,3
1.9,3
1.68,2
1.78,2
1.70,2
1.68,2
1.65,2

this is the code that I modified thanks to jlb's precious help :

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#include <bits/stdc++.h>
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
using namespace std;

// A training or query sample: feature coordinates and an integer class label.
struct Node
{
	vector<double> v;	// one value per feature dimension
	int re;			// class label
};

// Global sample table with room for 10005 entries (zero-initialized as a global).
Node board[10005];

// Comparator for (distance, label) pairs that compares distances only; with
// priority_queue it keeps the pair with the greatest distance at top().
struct cmp
{
	bool operator()(const pair<double, int> left, const pair<double, int> right)
	{
		const bool leftIsCloser = left.first < right.first;
		return leftIsCloser;
	}
};

// Global heap of (distance, label) candidates, farthest candidate on top.
priority_queue<pair<double, int>, std::vector< pair<double, int> >, cmp> Q;

double getLen(int dim, int num1, int num2)
{
	// Euclidean distance between samples board[num1] and board[num2] using
	// their first `dim` coordinates (no bounds check on the vectors).
	double squaredTotal = 0;
	for (int k = 0; k < dim; ++k)
	{
		const double gap = board[num1].v[k] - board[num2].v[k];
		squaredTotal += pow(gap, 2);
	}
	return sqrt(squaredTotal);
}

int getRe()
{
	// Majority vote over the labels held in Q (Q is emptied as a side effect).
	// Ties go to the smallest label: the map is visited in ascending key order
	// and only a strictly larger count replaces the current winner.
	// Returns -1 when Q was empty.
	map<int, int> book;	// label -> votes
	for (; !Q.empty(); Q.pop())
	{
		// A missing label is value-initialized to 0 by operator[].
		++book[Q.top().second];
	}

	// Fix: `re` was uninitialized, so calling this with an empty Q returned
	// an indeterminate value.
	int num = 0;
	int re = -1;
	for (const auto &vote : book)
	{
		if (vote.second > num)
		{
			num = vote.second;
			re = vote.first;
		}
	}

	return re;
}

int main()
{
	// k-nearest-neighbour classifier trained from "training.txt".
	// The file holds comma-separated numbers. They are consumed as one stream:
	// every `dim` values form a sample's features and the value after them is
	// its label, so with one sample per line each line needs dim + 1 fields.
	// Queries (dim values each) are then read from standard input.
	const int capacity = static_cast<int>(sizeof(board) / sizeof(board[0]));

	int dim;	//dimensions
	int K;
	cout << "Please enter the dimensions of the sample data:" << endl;
	cin >> dim;
	cout << "Please enter a K value:" << endl;
	cin >> K;
	// Fix: K < 1 leaves the heap empty (Q.top() would be undefined) and
	// dim < 1 can never complete a sample.
	if (!cin || dim < 1 || K < 1)
	{
		cerr << "The dimension and K must both be positive integers." << endl;
		return 1;
	}

	ifstream infile("training.txt");
	if (!infile)
	{
		// Fix: a missing file used to be silently treated as "no data".
		cerr << "Cannot open training.txt" << endl;
		return 1;
	}

	int top = 0, num = 0;
	string line;
	string str;

	//  Read the file line by line and split each line at the commas
	while (getline(infile, line))
	{
		istringstream ss(line);
		while (getline(ss, str, ','))
		{
			cout << str << "\n";

			// Fix: atoi() dropped the fractional part (1.5 became 1). Parse
			// the field as a double instead and reject non-numeric text.
			istringstream field(str);
			double t;
			if (!(field >> t))
			{
				cerr << "Invalid number in training.txt: " << str << endl;
				return 1;
			}

			// Fix: slot `top` is also used for the query, so the last slot
			// of board is kept free instead of being overrun.
			if (top >= capacity - 1)
			{
				cerr << "Too many samples (maximum " << capacity - 1 << ")." << endl;
				return 1;
			}

			if (num == dim)
			{
				// The value after dim features is the label.
				board[top].re = static_cast<int>(t);
				num = 0;
				++top;
			}
			else
			{
				board[top].v.push_back(t);
				++num;
			}
		}
	}

	if (top == 0)
	{
		cerr << "training.txt contains no complete sample." << endl;
		return 1;
	}

	while (true)
	{
		cout << "Please enter the pending data (input -1 end program):" << endl;
		while (!Q.empty()) Q.pop();
		// Slot `top` holds the query; this also drops a trailing incomplete
		// training sample left there by the loading loop.
		vector<double> ().swap(board[top].v);	//Clear the vector and minimize its capacity
		for (int i = 0; i < dim; ++i)
		{
			double t;
			// Fix: stop on EOF or malformed input as well; the unchecked read
			// made the program loop forever once the input ended.
			if (!(cin >> t) || t == -1) return 0;	//End the program
			board[top].v.push_back(t);
		}

		// Seed the heap with the first K samples (or all of them if fewer).
		for (int i = 0; i < K && i < top; ++i)
		{
			Q.push(make_pair(getLen(dim, i, top), board[i].re));
		}

		if (top <= K) cout << "The result is: " << getRe() << endl;
		else
		{
			// Q.top() is the farthest neighbour kept so far; replace it
			// whenever a closer sample turns up.
			for (int i = K; i < top; ++i)
			{
				double t = getLen(dim, i, top);
				if (t < Q.top().first)
				{
					Q.pop();
					Q.push(make_pair(t, board[i].re));
				}
			}

			cout << "The result is:" << getRe() << endl;
		}
	}

	return 0;
}


On line 119, where the program asks for a line to classify, shouldn't the number of values to enter be dim-1 rather than dim? If I have a training table of dimension 2, is the first column the data and the second the label?
Wouldn't it be more correct to do this:
1
2
3
4
5
6
7
8
9

     // Proposed variant of the query-input loop: starting at i = 1 makes it
     // read dim - 1 values into board[top].v instead of dim.
     for (int i = 1; i < dim; ++i)
		{
			double t;
			// NOTE(review): the result of this read is not checked.
			cin >> t;
			if (t == -1) return 0;	//End the program 
			board[top].v.push_back(t);
		}
Last edited on
This runs. It's anybody's guess what it is supposed to do other than demonstrate how not to plan, design, write and construct a program.

Sure this is not a hoax?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <queue>
#include <map>

using namespace std;

// Sample record: a feature vector together with its integer class label.
struct Node
{
    vector<double> v;   // feature coordinates
    int re;             // class label
};

// Global table of 10005 sample slots (zero-initialized as a global).
Node board[10005];

// Compares (distance, label) pairs by distance alone. As the priority_queue
// comparator it makes top() the entry with the largest distance.
struct cmp
{
    bool operator()(const pair<double, int> x, const pair<double, int> y)
    {
        if (x.first < y.first)
            return true;
        return false;
    }
};

// Global heap of (distance, label) candidates; the farthest one is on top.
priority_queue<pair<double, int>, std::vector< pair<double, int> >, cmp> Q;

double getLen(int dim, int num1, int num2)
{
    // Euclidean distance between board[num1] and board[num2], taken over the
    // first `dim` coordinates. The vectors are indexed without bounds checks.
    const vector<double> &from = board[num1].v;
    const vector<double> &to = board[num2].v;

    double accumulated = 0;
    for (int coord = 0; coord < dim; ++coord)
    {
        accumulated += pow(from[coord] - to[coord], 2);
    }
    return sqrt(accumulated);
}

int getRe()
{
    // Majority vote over the labels stored in Q; Q is left empty afterwards.
    // When several labels share the highest count the smallest label wins,
    // because the map is scanned in ascending order with a strict comparison.
    // Returns -1 if Q held no entries.
    map<int, int> book;   // label -> number of votes
    while (!Q.empty())
    {
        // operator[] creates a missing counter as 0 before it is incremented.
        ++book[Q.top().second];
        Q.pop();
    }

    // Fix: the result was previously uninitialized, so an empty Q produced
    // an indeterminate return value.
    int num = 0;
    int re = -1;
    for (const auto &vote : book)
    {
        if (vote.second > num)
        {
            num = vote.second;
            re = vote.first;
        }
    }

    return re;
}

int main()
{
    // k-nearest-neighbour classifier trained from "training.txt".
    // The file holds comma-separated numbers that are consumed as one stream:
    // every `dim` values are a sample's features and the next value is its
    // label, so with one sample per line each line needs dim + 1 fields.
    // Queries (dim values each) are then read from standard input.
    const int capacity = static_cast<int>(sizeof(board) / sizeof(board[0]));

    int dim;    //dimensions
    int K;
    cout << "Please enter the dimensions of the sample data:" << endl;
    cin >> dim;
    cout << "Please enter a K value:" << endl;
    cin >> K;
    // Fix: K < 1 leaves the heap empty (Q.top() would be undefined) and
    // dim < 1 can never complete a sample.
    if (!cin || dim < 1 || K < 1)
    {
        cerr << "The dimension and K must both be positive integers." << endl;
        return 1;
    }

    ifstream infile("training.txt");
    if (!infile)
    {
        // Fix: a missing file used to be silently treated as "no data".
        cerr << "Cannot open training.txt" << endl;
        return 1;
    }

    int top = 0, num = 0;
    string line;
    string str;

    //  Read the file line by line and split each line at the commas
    while (getline(infile, line))
    {
        istringstream ss(line);
        while (getline(ss, str, ','))
        {
            cout << str << "\n";

            // Fix: atoi() dropped the fractional part (1.5 became 1). Parse
            // the field as a double instead and reject non-numeric text.
            istringstream field(str);
            double t;
            if (!(field >> t))
            {
                cerr << "Invalid number in training.txt: " << str << endl;
                return 1;
            }

            // Fix: slot `top` is also used for the query, so the last slot
            // of board is kept free instead of being overrun.
            if (top >= capacity - 1)
            {
                cerr << "Too many samples (maximum " << capacity - 1 << ")." << endl;
                return 1;
            }

            if (num == dim)
            {
                // The value after dim features is the label.
                board[top].re = static_cast<int>(t);
                num = 0;
                ++top;
            }
            else
            {
                board[top].v.push_back(t);
                ++num;
            }
        }
    }

    if (top == 0)
    {
        cerr << "training.txt contains no complete sample." << endl;
        return 1;
    }

    while (true)
    {
        cout << "Please enter the pending data (input -1 end program):" << endl;
        while (!Q.empty()) Q.pop();
        // Slot `top` holds the query; this also drops a trailing incomplete
        // training sample left there by the loading loop.
        vector<double> ().swap(board[top].v);    //Clear the vector and minimize its capacity
        for (int i = 0; i < dim; ++i)
        {
            double t;
            // Fix: stop on EOF or malformed input as well; the unchecked read
            // made the program loop forever once the input ended.
            if (!(cin >> t) || t == -1) return 0;    //End the program
            board[top].v.push_back(t);
        }

        // Seed the heap with the first K samples (or all of them if fewer).
        for (int i = 0; i < K && i < top; ++i)
        {
            Q.push(make_pair(getLen(dim, i, top), board[i].re));
        }

        if (top <= K) cout << "The result is: " << getRe() << endl;
        else
        {
            // Q.top() is the farthest neighbour kept so far; replace it
            // whenever a closer sample turns up.
            for (int i = K; i < top; ++i)
            {
                double t = getLen(dim, i, top);
                if (t < Q.top().first)
                {
                    Q.pop();
                    Q.push(make_pair(t, board[i].re));
                }
            }

            cout << "The result is:" << getRe() << endl;
        }
    }

    return 0;
}


Please enter the dimensions of the sample data:
3
Please enter a K value:
5
1.5
1
1.92
3
1.7
2
1.73
2
1.6
1
1.75
2
1.5
1
1.6
1
2.05
3
1.9
3
1.68
2
1.78
2
1.70
2
1.68
2
1.65
2
Please enter the pending data (input -1 end program):
2
1
-1
Program ended with exit code: 0
What you tried is the version I modified: hence the topic marked as resolved. But all is well, thank you!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <map>
using namespace std;

// An attribute is a pair of words (in the demo data: colour and shape).
typedef pair<string,string> Attribute;

// Reads the two words of an attribute as consecutive whitespace-separated tokens.
istream & operator >> ( istream &in, Attribute &a )
{
   in >> a.first;
   in >> a.second;
   return in;
}

// Writes the two words of an attribute separated by a single space.
ostream & operator << ( ostream &out, const Attribute &a )
{
   out << a.first;
   out << " ";
   out << a.second;
   return out;
}


// Frequency-table classifier: accumulates a weight for every (label, attribute)
// combination and derives probabilities from the row, column and grand totals.
// Call calcSums() after the last insert()/readData() and before querying.
// L: label type, A: attribute type. Both must be usable as map keys;
// readData() and summary() additionally need stream operators for them.
template<typename L, typename A> class NaiveBayes     // L: label, A: attributes
{
   map< pair<L,A>, double > data;                     // raw training data: (label, attribute) -> weight
   map< L, double > labelSums;                        // total weight per label     (set by calcSums)
   map< A, double > attributeSums;                    // total weight per attribute (set by calcSums)
   double sumData = 0.0;                              // total weight               (set by calcSums)

   // Returns part / sums[key], or 0 when the key has no total yet or the total
   // is zero. Replaces `part / sums[key]`, which inserted a zero entry and
   // divided by it when calcSums() had not been run.
   template<typename K> static double shareOf( double part, const map<K,double> &sums, const K &key )
   {
      auto it = sums.find( key );
      return ( it == sums.end() || it->second == 0 ) ? 0 : part / it->second;
   }

public:
   // Adds weight d to the (label, attrib) combination.
   void insert( L label, A attrib, double d ) { data[{label,attrib}] += d; }

   // Reads "label attribute weight" records until extraction fails.
   void readData( istream &in )
   {
      L label;
      A attrib;
      double d;
      while ( in >> label >> attrib >> d ) insert( label, attrib, d );
   }

   // Recomputes the per-label, per-attribute and grand totals from the raw data.
   void calcSums()
   {
      labelSums.clear();   attributeSums.clear();   sumData = 0.0;
      for ( const auto &e : data )                    // by reference: no copy of each entry
      {
         labelSums    [e.first.first ] += e.second;
         attributeSums[e.first.second] += e.second;
         sumData += e.second;
      }
   }

   // P(label); 0 for an unknown label or an empty data set.
   double getLabelProbability( L label ) const
   {
      auto it = labelSums.find( label );
      return ( it == labelSums.end() || sumData == 0 ) ? 0 : it->second / sumData;
   }

   // P(attrib); 0 for an unknown attribute or an empty data set.
   double getAttributeProbability( A attrib ) const
   {
      auto it = attributeSums.find( attrib );
      return ( it == attributeSums.end() || sumData == 0 ) ? 0 : it->second / sumData;
   }

   // P(attrib | label); 0 when the combination never occurred.
   double getConditionalProbabilityAgivenL( A attrib, L label ) const
   {
      auto it = data.find( {label, attrib} );
      return it == data.end() ? 0 : shareOf( it->second, labelSums, label );
   }

   // P(label | attrib); 0 when the combination never occurred.
   double getConditionalProbabilityLgivenA( L label, A attrib ) const
   {
      auto it = data.find( {label, attrib} );
      return it == data.end() ? 0 : shareOf( it->second, attributeSums, attrib );
   }

   // Prints every P(label), P(label|attribute), P(attribute) and P(attribute|label) to cout.
   void summary() const
   {
      for ( const auto &e : labelSums )
      {
         const L &label = e.first;
         cout << "P(" << label << ") = " << getLabelProbability( label ) << '\n';
         for ( const auto &f : attributeSums )
         {
            const A &attrib = f.first;
            cout << "   " << "P(" << label << "|" << attrib << ") = " << getConditionalProbabilityLgivenA( label, attrib ) << '\n';
         }
      }
      cout << "\n";

      for ( const auto &f : attributeSums )
      {
         const A &attrib = f.first;
         cout << "P(" << attrib << ") = " << getAttributeProbability( attrib ) << '\n';
         for ( const auto &e : labelSums )
         {
            const L &label = e.first;
            cout << "   " << "P(" << attrib << "|" << label << ") = " << getConditionalProbabilityAgivenL( attrib, label ) << '\n';
         }
      }
      cout << "\n";
   }

   // Returns the label with the highest P(label | attrib). Ties keep the label
   // that comes first in map order. An attribute with no total yields a
   // value-initialized L (an empty string for string labels).
   L predictLabel( A attrib ) const
   {
      L result{};
      if ( attributeSums.find( attrib ) == attributeSums.end() ) return result;

      double maxProbability = -1;
      for ( const auto &e : data )
      {
         if ( e.first.second == attrib )
         {
            const L &label = e.first.first;
            double p = getConditionalProbabilityLgivenA( label, attrib );
            if ( p > maxProbability )
            {
               maxProbability = p;
               result = label;
            }
         }
      }
      return result;
   }
};


int main()
{
// ifstream in( "data.txt" );
   istringstream in( "apple  green  round 20\n"
                     "apple  red  round 50\n"
                     "apple  yellow  round 10\n"
                     "apple  red  oval 5\n"
                     "apple  red  heart 5\n"
                     "pineapple  green  oval 30\n"
                     "pineapple  yellow  oval 70\n"
                     "pineapple  green  round 5\n"
                     "pineapple  yellow  round 5\n"
                     "cherry  yellow  heart 50\n"
                     "cherry  red  heart 70\n"
                     "cherry  yellow  round 5\n" );

   NaiveBayes<string,Attribute> B;
   B.readData( in );
   B.calcSums();
   B.summary();

   Attribute attrib;   attrib = { "red", "heart" };
   cout << "Predictor for " << attrib << " is " << B.predictLabel( attrib ) << '\n';
}


P(apple) = 0.276923
   P(apple|green oval) = 0
   P(apple|green round) = 0.8
   P(apple|red heart) = 0.0666667
   P(apple|red oval) = 1
   P(apple|red round) = 1
   P(apple|yellow heart) = 0
   P(apple|yellow oval) = 0
   P(apple|yellow round) = 0.5
P(cherry) = 0.384615
   P(cherry|green oval) = 0
   P(cherry|green round) = 0
   P(cherry|red heart) = 0.933333
   P(cherry|red oval) = 0
   P(cherry|red round) = 0
   P(cherry|yellow heart) = 1
   P(cherry|yellow oval) = 0
   P(cherry|yellow round) = 0.25
P(pineapple) = 0.338462
   P(pineapple|green oval) = 1
   P(pineapple|green round) = 0.2
   P(pineapple|red heart) = 0
   P(pineapple|red oval) = 0
   P(pineapple|red round) = 0
   P(pineapple|yellow heart) = 0
   P(pineapple|yellow oval) = 1
   P(pineapple|yellow round) = 0.25

P(green oval) = 0.0923077
   P(green oval|apple) = 0
   P(green oval|cherry) = 0
   P(green oval|pineapple) = 0.272727
P(green round) = 0.0769231
   P(green round|apple) = 0.222222
   P(green round|cherry) = 0
   P(green round|pineapple) = 0.0454545
P(red heart) = 0.230769
   P(red heart|apple) = 0.0555556
   P(red heart|cherry) = 0.56
   P(red heart|pineapple) = 0
P(red oval) = 0.0153846
   P(red oval|apple) = 0.0555556
   P(red oval|cherry) = 0
   P(red oval|pineapple) = 0
P(red round) = 0.153846
   P(red round|apple) = 0.555556
   P(red round|cherry) = 0
   P(red round|pineapple) = 0
P(yellow heart) = 0.153846
   P(yellow heart|apple) = 0
   P(yellow heart|cherry) = 0.4
   P(yellow heart|pineapple) = 0
P(yellow oval) = 0.215385
   P(yellow oval|apple) = 0
   P(yellow oval|cherry) = 0
   P(yellow oval|pineapple) = 0.636364
P(yellow round) = 0.0615385
   P(yellow round|apple) = 0.111111
   P(yellow round|cherry) = 0.04
   P(yellow round|pineapple) = 0.0454545

Predictor for red heart is cherry

Last edited on
Hello, excellent, thank you very much! Honestly, it's great.
Hello, I've been looking for a way to read large files faster and I found a piece of code that I put in place of the proposed code (which is perfectly suitable and for which I thank Lastchance again). But I would like to know if it is indeed faster, can you tell me what you think about it ?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <map>
using namespace std;
// An attribute is a pair of words.
typedef pair<string, string> Attribute;

// Extracts the two words of an attribute as consecutive whitespace-separated tokens.
istream &operator>>(istream &in, Attribute &a)
{
	in >> a.first;
	return in >> a.second;
}

// Inserts the two words of an attribute separated by one space.
ostream &operator<<(ostream &out, const Attribute &a)
{
	out << a.first << " ";
	return out << a.second;
}

// Frequency-table classifier: accumulates a weight for every (label, attribute)
// combination and derives probabilities from the row, column and grand totals.
// Call calcSums() after the last insert()/readData() and before querying.
// L: label type, A: attribute type. Both must be usable as map keys;
// readData() and summary() additionally need stream operators for them.
template < typename L, typename A > class NaiveBayes	// L: label, A: attributes
{
	map<pair<L, A>, double> data;	// raw training data: (label, attribute) -> weight
	map<L, double> labelSums;	// total weight per label (set by calcSums)
	map<A, double> attributeSums;	// total weight per attribute (set by calcSums)
	double sumData = 0.0;	// total weight (set by calcSums)

	// Returns part / sums[key], or 0 when the key has no total yet or the
	// total is zero. Replaces `part / sums[key]`, which inserted a zero entry
	// and divided by it when calcSums() had not been run.
	template < typename K > static double shareOf(double part, const map<K, double> &sums, const K &key)
	{
		auto it = sums.find(key);
		return (it == sums.end() || it->second == 0) ? 0 : part / it->second;
	}

	public:
		// Adds weight d to the (label, attrib) combination.
		void insert(L label, A attrib, double d)
		{
			data[{ label, attrib }] += d;
		}

	// Reads "label attribute weight" records until extraction fails.
	void readData(istream & in)
	{
		L label;
		A attrib;
		double d;
		while (in >> label >> attrib >> d) insert(label, attrib, d);
	}

	// Recomputes the per-label, per-attribute and grand totals from the raw data.
	void calcSums()
	{
		labelSums.clear();
		attributeSums.clear();
		sumData = 0.0;
		for (const auto &e: data)	// by reference: no copy of each entry
		{
			labelSums[e.first.first] += e.second;
			attributeSums[e.first.second] += e.second;
			sumData += e.second;
		}
	}

	// P(label); 0 for an unknown label or an empty data set.
	double getLabelProbability(L label) const
	{
		auto it = labelSums.find(label);
		return (it == labelSums.end() || sumData == 0) ? 0 : it->second / sumData;
	}

	// P(attrib); 0 for an unknown attribute or an empty data set.
	double getAttributeProbability(A attrib) const
	{
		auto it = attributeSums.find(attrib);
		return (it == attributeSums.end() || sumData == 0) ? 0 : it->second / sumData;
	}

	// P(attrib | label); 0 when the combination never occurred.
	double getConditionalProbabilityAgivenL(A attrib, L label) const
	{
		auto it = data.find({ label, attrib });
		return it == data.end() ? 0 : shareOf(it->second, labelSums, label);
	}

	// P(label | attrib); 0 when the combination never occurred.
	double getConditionalProbabilityLgivenA(L label, A attrib) const
	{
		auto it = data.find({ label, attrib });
		return it == data.end() ? 0 : shareOf(it->second, attributeSums, attrib);
	}

	// Prints every P(label), P(label|attribute), P(attribute) and
	// P(attribute|label) to cout.
	void summary() const
	{
		for (const auto &e: labelSums)
		{
			const L &label = e.first;
			cout << "P(" << label << ") = " << getLabelProbability(label) << '\n';
			for (const auto &f: attributeSums)
			{
				const A &attrib = f.first;
				cout << "   " << "P(" << label << "|" << attrib << ") = " << getConditionalProbabilityLgivenA(label, attrib) << '\n';
			}
		}

		cout << "\n";

		for (const auto &f: attributeSums)
		{
			const A &attrib = f.first;
			cout << "P(" << attrib << ") = " << getAttributeProbability(attrib) << '\n';
			for (const auto &e: labelSums)
			{
				const L &label = e.first;
				cout << "   " << "P(" << attrib << "|" << label << ") = " << getConditionalProbabilityAgivenL(attrib, label) << '\n';
			}
		}

		cout << "\n";
	}

	// Returns the label with the highest P(label | attrib). Ties keep the
	// label that comes first in map order. An attribute with no total yields
	// a value-initialized L (an empty string for string labels).
	L predictLabel(A attrib) const
	{
		L result {};
		if (attributeSums.find(attrib) == attributeSums.end()) return result;

		double maxProbability = -1;
		for (const auto &e: data)
		{
			if (e.first.second == attrib)
			{
				const L &label = e.first.first;
				double p = getConditionalProbabilityLgivenA(label, attrib);
				if (p > maxProbability)
				{
					maxProbability = p;
					result = label;
				}
			}
		}

		return result;
	}
};

int main()
{
	//Start opening your file
	ifstream inBigArrayfile;
	inBigArrayfile.open("data.txt", std::ios::binary | std::ios:: in);

	//Find length of file
	inBigArrayfile.seekg(0, std::ios::end);
	long Length = inBigArrayfile.tellg();
	inBigArrayfile.seekg(0, std::ios::beg);

	//read in the data from your file
	char *InFileData = new char[Length];
	inBigArrayfile.read(InFileData, Length);

	//cout << InFileData << endl;
	istringstream in (InFileData);
	/*
	   istringstream in("apple  green  round 20\n"
	                     "apple  red  round 50\n"
	                     "apple  yellow  round 10\n"
	                     "apple  red  oval 5\n"
	                     "apple  red  heart 5\n"
	                     "pineapple  green  oval 30\n"
	                     "pineapple  yellow  oval 70\n"
	                     "pineapple  green  round 5\n"
	                     "pineapple  yellow  round 5\n"
	                     "cherry  yellow  heart 50\n"
	                     "cherry  red  heart 70\n"
	                     "cherry  yellow  round 5\n");
	*/
	NaiveBayes<string, Attribute> B;
	B.readData(in);
	B.calcSums();
	B.summary();

	Attribute attrib;
	attrib = { "word", "cpp" };
	cout << "Predictor for " << attrib << " is " << B.predictLabel(attrib) << '\n';

	//Clean up
	delete[] InFileData;

	return 0;
}
Last edited on
There are a couple of posts (about a year old?) that went deep on opening big files fast, covering memory-mapped files and comparing various tricks to make it faster. See if you can find them with the search tool. There are a lot of considerations in those topics.

Reading it into one large chunk is pretty good. It's not the best, but it's not the worst either, and it's simple to code. You are limited by how much RAM you have free in a single contiguous block, though, and that may or may not be enough for some tasks. On a modern computer, a large file, to me, is multiple GB. I don't start to worry about it too much under 5GB (total over all files open at once).
Last edited on
Hello, I found this faster way to read the file on stackoverflow.com. It is the function on lines 10 to 21 below.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <iostream>
#include <string>
#include <fstream>
#include <sstream>
#include <map>
#include <vector>
using namespace std;
using Attribute = pair<string, string> ;

// Reads the whole file `fileName` in binary mode and returns its bytes.
// Returns an empty string when the file cannot be opened, its size cannot be
// determined, or it is empty. If fewer bytes than expected can be read, only
// the bytes actually read are returned.
std::string readFile2(const std::string &fileName)
{
	// ios::ate opens positioned at the end, so tellg() reports the file size.
	std::ifstream ifs(fileName.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
	if (!ifs) return std::string();

	const std::streamoff fileSize = ifs.tellg();
	if (fileSize <= 0) return std::string();	// -1 on failure, 0 for an empty file
	ifs.seekg(0, std::ios::beg);

	// Read straight into the result to avoid an intermediate buffer and copy.
	std::string contents(static_cast<std::string::size_type>(fileSize), '\0');
	ifs.read(&contents[0], fileSize);
	contents.resize(static_cast<std::string::size_type>(ifs.gcount()));
	return contents;
}

// Extracts an Attribute as two consecutive whitespace-separated values:
// first into a.first, then into a.second. Returns the stream for chaining.
istream &operator>>(istream &in, Attribute &a)
{
	in >> a.first;
	in >> a.second;
	return in;
}

// Writes an Attribute as its two components separated by a single space.
// Returns the stream for chaining.
ostream &operator<<(ostream &out, const Attribute &a)
{
	out << a.first;
	out << " ";
	out << a.second;
	return out;
}

/**
 * Classifier over weighted (label, attribute) observations.
 *
 * L is the label type and A the attribute type. Both are used as std::map
 * keys; readData() and summary() additionally stream them with >> and <<.
 *
 * The per-label, per-attribute and grand totals are cached by calcSums(), not
 * updated incrementally: call calcSums() after the last insert()/readData()
 * and before querying. Every probability query returns 0 (never inf or NaN)
 * when the value it needs is missing or the divisor is zero, and no query
 * modifies the object.
 */
template <typename L, typename A> class NaiveBayes	// L: label, A: attributes
{
	std::map<std::pair<L, A>, double> data;	// raw training data: (label, attribute) -> weight
	std::map<L, double> labelSums;	// total weight per label
	std::map<A, double> attributeSums;	// total weight per attribute
	double sumData = 0.0;	// total weight of all observations

	// Returns part / whole, or 0 when whole is exactly zero.
	static double safeRatio(double part, double whole)
	{
		return whole != 0.0 ? part / whole : 0.0;
	}

	// Returns the cached total stored for `key`, or 0 when there is none.
	template <typename K>
	static double totalFor(const std::map<K, double> &sums, const K &key)
	{
		const auto it = sums.find(key);
		return it == sums.end() ? 0.0 : it->second;
	}

public:
	// Adds weight `d` to the observation (label, attrib), creating it if needed.
	void insert(const L &label, const A &attrib, double d)
	{
		data[{ label, attrib }] += d;
	}

	// Reads "label attribute weight" records from `in` until extraction fails
	// (end of input or a malformed record) and inserts each one.
	void readData(std::istream &in)
	{
		L label;
		A attrib;
		double d;
		while (in >> label >> attrib >> d) insert(label, attrib, d);
	}

	// Rebuilds the cached totals from the raw data, discarding earlier totals.
	void calcSums()
	{
		labelSums.clear();
		attributeSums.clear();
		sumData = 0.0;
		for (const auto &entry : data)
		{
			const double weight = entry.second;
			labelSums[entry.first.first] += weight;
			attributeSums[entry.first.second] += weight;
			sumData += weight;
		}
	}

	// Returns P(label): the label's share of the total weight, 0 if unknown.
	double getLabelProbability(const L &label) const
	{
		return safeRatio(totalFor(labelSums, label), sumData);
	}

	// Returns P(attrib): the attribute's share of the total weight, 0 if unknown.
	double getAttributeProbability(const A &attrib) const
	{
		return safeRatio(totalFor(attributeSums, attrib), sumData);
	}

	// Returns P(attrib | label), or 0 if the pair was never observed.
	double getConditionalProbabilityAgivenL(const A &attrib, const L &label) const
	{
		const auto it = data.find({ label, attrib });
		if (it == data.end()) return 0.0;
		// find()-based lookup: a stale cache must not grow labelSums.
		return safeRatio(it->second, totalFor(labelSums, label));
	}

	// Returns P(label | attrib), or 0 if the pair was never observed.
	double getConditionalProbabilityLgivenA(const L &label, const A &attrib) const
	{
		const auto it = data.find({ label, attrib });
		if (it == data.end()) return 0.0;
		// find()-based lookup: a stale cache must not grow attributeSums.
		return safeRatio(it->second, totalFor(attributeSums, attrib));
	}

	// Prints two tables to standard output, each followed by a blank line:
	// P(label) with P(label|attribute) rows, then P(attribute) with
	// P(attribute|label) rows.
	void summary() const
	{
		for (const auto &e : labelSums)
		{
			const L &label = e.first;
			std::cout << "P(" << label << ") = " << getLabelProbability(label) << '\n';
			for (const auto &f : attributeSums)
			{
				const A &attrib = f.first;
				std::cout << "   " << "P(" << label << "|" << attrib << ") = " << getConditionalProbabilityLgivenA(label, attrib) << '\n';
			}
		}

		std::cout << "\n";

		for (const auto &f : attributeSums)
		{
			const A &attrib = f.first;
			std::cout << "P(" << attrib << ") = " << getAttributeProbability(attrib) << '\n';
			for (const auto &e : labelSums)
			{
				const L &label = e.first;
				std::cout << "   " << "P(" << attrib << "|" << label << ") = " << getConditionalProbabilityAgivenL(attrib, label) << '\n';
			}
		}

		std::cout << "\n";
	}

	// Returns the label with the largest P(label | attrib). On a tie the first
	// label in map order wins. Returns a value-initialised L when the
	// attribute has no cached total.
	L predictLabel(const A &attrib) const
	{
		L result {};
		const auto total = attributeSums.find(attrib);
		if (total == attributeSums.end()) return result;

		double maxProbability = -1;
		for (const auto &entry : data)
		{
			if (!(entry.first.second == attrib)) continue;

			// Same value as getConditionalProbabilityLgivenA, with the
			// attribute total looked up once instead of per row.
			const double p = safeRatio(entry.second, total->second);
			if (p > maxProbability)
			{
				maxProbability = p;
				result = entry.first.first;
			}
		}

		return result;
	}
};

int main()
{
	istringstream in (readFile2("data.txt"));
	NaiveBayes<string, Attribute> B;
	B.readData(in);
	B.calcSums();
	B.summary();

	Attribute attrib;
	attrib = { "word", "cpp" };
	cout << "Predictor for " << attrib << " is " << B.predictLabel(attrib) << '\n';
	return 0;
}
Last edited on
Quote from stackoverflow.com :

Results (Average of 100 runs; timed using gettimeofday, file was 40 paragraphs of lorem ipsum):

readFile1: 764
readFile2: 104
readFile3: 129
readFile4: 402

The implementations:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
// Returns the full contents of `fileName`, read character by character through
// stream-buffer iterators. A file that cannot be opened yields an empty string.
string readFile1(const string &fileName)
{
    ifstream source(fileName.c_str());
    std::istreambuf_iterator<char> first(source);
    const std::istreambuf_iterator<char> last;	// default-constructed: end of stream
    return string(first, last);
}

// Reads the whole file `fileName` in binary mode and returns its bytes.
// Returns an empty string when the file cannot be opened, its size cannot be
// determined, or it is empty. If fewer bytes than expected can be read, only
// the bytes actually read are returned.
std::string readFile2(const std::string &fileName)
{
    // ios::ate opens positioned at the end, so tellg() reports the file size.
    std::ifstream ifs(fileName.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
    if (!ifs) return std::string();

    const std::streamoff fileSize = ifs.tellg();
    if (fileSize <= 0) return std::string();  // -1 on failure, 0 for an empty file
    ifs.seekg(0, std::ios::beg);

    // Read straight into the result to avoid an intermediate buffer and copy.
    std::string contents(static_cast<std::string::size_type>(fileSize), '\0');
    ifs.read(&contents[0], fileSize);
    contents.resize(static_cast<std::string::size_type>(ifs.gcount()));
    return contents;
}

// Returns the full contents of `fileName` using getline with the character
// value of traits eof() as the delimiter.
// That delimiter is a real byte value that may occur inside the file. Each
// time getline stops on it before end-of-file, the byte is put back into the
// result and reading continues, so such files are no longer truncated.
// A file that cannot be opened yields an empty string.
std::string readFile3(const std::string &fileName)
{
    const char delimiter = std::string::traits_type::to_char_type(
                           std::string::traits_type::eof());

    std::string data;
    std::string chunk;
    std::ifstream in(fileName.c_str());
    while (std::getline(in, chunk, delimiter))
    {
        data += chunk;
        // eofbit is set only when getline ran out of input; otherwise it
        // stopped on (and discarded) a genuine delimiter byte.
        if (!in.eof()) data += delimiter;
    }
    return data;
}

// Returns the full contents of `filename`, read in binary mode through
// stream-buffer iterators after reserving capacity for the file size.
// Returns an empty string when the file cannot be opened. When the size
// cannot be determined the reservation is skipped and the data is still read.
std::string readFile4(const std::string& filename)
{
    std::ifstream file(filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate);

    std::string data;
    if (!file) return data;

    // tellg() returns -1 on failure; passing that to reserve() would throw.
    const std::streamoff size = file.tellg();
    if (size > 0) data.reserve(static_cast<std::string::size_type>(size));

    file.seekg(0, std::ios::beg);
    data.append(std::istreambuf_iterator<char>(file.rdbuf()),
                std::istreambuf_iterator<char>());
    return data;
}
Last edited on
Topic archived. No new replies allowed.