Remove words from a string

EDIT: So I started with removing stop words from a string, but rather than editing the string itself, I take each word from the string and separate them into a vector<string>.

-------------

My project is to create a function that receives a string (by reference), then removes the stop words from that string.
I have the stop words sorted in the file stopwords.txt
e.g.
1
2
3
4
a
about
above
across


The first part of my function takes each stop word and puts it in vector<string>stopwords. This part works fine.

The second part should remove each instance of the stop word from the string, but an error occurs in #include<algorithm>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
using namespace std;

// Intended to remove every stop word listed in stopwords.txt from `str`, in place.
// NOTE: as posted, this version does not compile; see the comments on the
// find() and erase() lines below.
void filterStopWords(string &str) {
	string lineread;
	ifstream sw("stopwords.txt");
	vector<string>stopwords;
	// Load one stop word per line. good() is tested before the read, so when the
	// file ends with a newline the last (failed) getline still pushes an empty entry.
	while (sw.good()) {
		getline(sw, lineread);
		stopwords.push_back(lineread);
	}
	sw.close();
	
	
	string::iterator itr, itr2; // two iterators to select stop words
	for (unsigned int i=0; i<stopwords.size(); i++) { // for every stopword in vector<string>stopwords...
		int reps = Count(stopwords.at(i), str); // count number of occurrences of stop word
		for (unsigned int j=0;j<reps; j++) { // for every time stopword.at(i) occurs... (note: unsigned j compared with signed reps)
			// str.begin()/str.end() iterate over single chars, so find() ends up comparing
			// a char with a std::string; this is the compile error reported inside <algorithm>.
			itr = find(str.begin(),str.end(),stopwords.at(i)); // first iterator set to position of stop word
			// itr2 points into the stop word itself, not into str, so it cannot
			// form a valid range together with itr.
			itr2 = stopwords.at(i).end(); // second iterator set to the size of stop word
			str.erase(itr, itr2); // erase specific instance of stop word
		}
	}
}

the Count() function shouldn't be the problem, but I'll post it just in case...
1
2
3
4
5
6
7
8
9
10
// Counts the non-overlapping occurrences of the substring `str` inside `obj`.
//   str - the substring to look for
//   obj - the text that is searched
// Returns the number of matches; 0 when `str` is empty or does not occur.
int Count(const string & str, const string & obj )
{
	// An empty pattern matches at every position and the search position would
	// never advance, so the loop below would spin forever. Treat it as "no match".
	if (str.empty())
		return 0;

	int n = 0;
	for (string::size_type pos = obj.find(str);
	     pos != string::npos;
	     pos = obj.find(str, pos + str.size())) {	// resume just past the previous match
		++n;
	}
	return n;
}


P.S. The string that is passed to my function is about a megabyte in size, so efficiency is somewhat important.
Last edited on
In line 23, the iterators for str, when dereferenced, return a char. You're trying to compare a char to a std::string.

If efficiency is important, get rid of the Count function altogether. Doing things twice isn't efficient.

Should you only be considering removing entire words? IOW, if "hat" is a word in the string, and "a" is a stop word, should "hat" become "ht"? Does case matter?

The stop words in the text file appear to be sorted, are they?

but an error occurs in #include<algorithm>


In the future, please include the text of the error in your post.
o.k. So I got my code to work using a different and simpler method... Thanks for the help.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// Splits `str` into words separated by one or more space characters (' ').
//   str - the text to split; it is only read, never modified
// Returns the words in their original order. Leading, trailing and repeated
// spaces never produce empty entries, and an empty input yields an empty vector.
vector<string> getWords(string &str) {
	vector<string> words;
	string word;
	for (string::size_type i = 0; i < str.length(); ++i) {
		const char letter = str[i];	// i < length() is guaranteed by the loop condition
		if (letter == ' ') {
			// A space ends the current word; a run of spaces leaves `word`
			// empty, so nothing is pushed for the extra separators.
			if (!word.empty()) {
				words.push_back(word);
				word.clear();
			}
		} else {
			word.push_back(letter);
		}
	}
	// Flush the last word when the text does not end with a space.
	if (!word.empty())
		words.push_back(word);
	return words;
}



EDIT: Now that the words are separated, filterStopWords now takes a vector of strings containing individual words.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
// Replaces every stop word in `str` with an empty string.
//   str - the individual words to filter; matching entries are blanked in place
//         (the vector keeps its size, so a later pass has to drop the blanks)
// The stop words are read from "stopwords.txt", one per line. If the file
// cannot be opened the list is empty and `str` is left untouched.
void filterStopWords(vector<string>&str) {
	string lineread;
	ifstream sw("stopwords.txt");
	vector<string>stopwords;
	// Test the read itself rather than sw.good() beforehand: otherwise the final,
	// failed getline() at end-of-file would append a bogus empty stop word.
	while (getline(sw, lineread)) {
		if (!lineread.empty())	// ignore blank lines in the file
			stopwords.push_back(lineread);
	}
	sw.close();
	for (vector<string>::size_type i = 0; i < str.size(); ++i) {
		// binsearch is defined elsewhere; presumably a binary search that yields
		// non-zero when the word is found, which needs stopwords.txt to be sorted.
		if (  binsearch(stopwords, 0, stopwords.size(), str.at(i))  ) {
			str.at(i)="";
		}
	}
}

Rather than erasing each stop word from the vector (very slow), stop words become blank strings. Another function filters out the blank strings.
Last edited on
Whitespace does not consist solely of spaces.

Is it just a list of space-separated words you're dealing with? No punctuation to worry about? Why is str passed by non-const reference?

Have you considered a solution using a string stream?
1
2
3
4
5
6
7
8
9
10
11
// Splits `str` on runs of whitespace (as defined by stream extraction) and
// returns the resulting tokens in order. An empty or all-whitespace input
// gives an empty vector.
std::vector<std::string> getWords(const std::string &str) 
{
    std::istringstream input(str) ;
    std::vector<std::string> words ;

    // operator>> skips leading whitespace and stops at the next whitespace,
    // so every successful extraction is exactly one word.
    for ( std::string word ; input >> word ; )
        words.push_back(word) ;

    return words ;
}


yes, no punctuation to worry about in this case, I have another function that removed them already.
1
2
3
4
// Removes every punctuation character and every decimal digit from `str`, in place.
//   str - the text to clean; all other characters keep their relative order
void filterNumbers(string &str) {
	// Single pass instead of two. The character is received as unsigned char
	// because handing a negative char value to ispunct/isdigit is undefined
	// behaviour (this happens for bytes >= 0x80 where char is signed).
	str.erase(remove_if(str.begin(), str.end(),
	                    [](unsigned char ch) { return std::ispunct(ch) || std::isdigit(ch); }),
	          str.end());
}


I guess I'm not modifying str... is it faster to use const? Or just good practice?


EDIT: There is a bug in the current code that when a string is passed with a whitespace at the end of the string,
"R6010
abort() has been called"
error is thrown at runtime.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#include <iostream>
#include <string>
#include <vector>
// #include <algorithm>
using namespace std;

// Splits `str` into space-separated words and returns them in order.
// This is the version that reproduces the reported runtime abort: any input
// that ends with a space (or is empty) makes str.at() throw.
vector<string> getWords(string &str) {
	vector<string> tmpvec;
	string word, letter;
	unsigned int i=0;
	do {
		// at() is bounds-checked, so an empty str already throws std::out_of_range here.
		letter=str.at(i);// char[i] in the string is put in letter
		if (letter==" ") {
			// Note: with a leading space `word` is still empty at this point,
			// so an empty string is pushed.
			tmpvec.push_back(word);// word is complete
			while (  (letter==" ") && (i < str.length())  ) {// while whitespace characters are in a row, skip them.
				i++;
				// The bound in the loop condition is only checked AFTER this read.
				// When the string ends with a space, i == str.length() here and
				// at() throws std::out_of_range.
				letter=str.at(i);
			}
			word.clear();
		} else {
			word.append(letter);// append letter to the end of word
			i++;
		}
		// Reached the end of the text: flush the word in progress.
		if (i == str.length())
			tmpvec.push_back(word);
	} while (i < str.length());
	return tmpvec;
}

// Driver for the getWords() reproduction: prints the input, then each
// extracted word on its own line, then the number of words.
int main() {
	string input="this is a test    string for        testing "; // whitespace at the end of string.
	cout << input << endl;
	const vector<string> words = getWords(input);
	for (vector<string>::const_iterator it = words.begin(); it != words.end(); ++it) {
		cout << *it << endl;
	}
	cout << "size of vector: " << words.size() << endl;
	return 0;
}
Last edited on
I guess I'm not modifying str... is it faster to use const? Or just safe practice?


It is good practice to use const if it is not your intention to modify the original, so the user of the function can know that they needn't worry about the function modifying the string, but it also affects how the function can be used.

Something like: getWords( "a really really really really long string literal") isn't doable if you take the parameter by non-const reference.


well, cire. That function is amazing... It's all working as intended now.


The final version of my code is:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <sstream>
using namespace std;

// Breaks `str` into whitespace-delimited words using stream extraction and
// returns them in the order they appear. No empty words are ever produced.
vector<string> getWords(const string &str) 
{
    vector<string> result ;
    istringstream stream(str) ;
    string current ;

    for (;;)
    {
        stream >> current ;
        if ( !stream )          // extraction failed: no more words
            break ;
        result.push_back(current) ;
    }

    return result ;
}

// Removes every entry shorter than two characters from `str`, in place.
// That covers the empty strings left behind by filterStopWords() and also any
// single-character words. The remaining words keep their relative order.
//   str - the word list to compact
void compactWords(vector<string> &str) {
	// erase-remove: shifts the kept words forward once instead of copying the
	// whole list into a temporary vector and back again.
	str.erase(remove_if(str.begin(), str.end(),
	                    [](const string &word) { return word.length() < 2; }),
	          str.end());
}

// Replaces every stop word in `str` with an empty string.
//   str - the individual words to filter; matching entries are blanked in place
//         (the vector keeps its size, so a later pass has to drop the blanks)
// The stop words are read from "stopwords.txt", one per line. If the file
// cannot be opened the list is empty and `str` is left untouched.
void filterStopWords(vector<string>&str) {
	string lineread;
	ifstream sw("stopwords.txt");
	vector<string>stopwords;
	// Test the read itself rather than sw.good() beforehand: otherwise the final,
	// failed getline() at end-of-file would append a bogus empty stop word.
	while (getline(sw, lineread)) {
		if (!lineread.empty())	// ignore blank lines in the file
			stopwords.push_back(lineread);
	}
	sw.close();
	for (vector<string>::size_type i = 0; i < str.size(); ++i) {
		// binsearch is defined elsewhere; presumably a binary search that yields
		// non-zero when the word is found, which needs stopwords.txt to be sorted.
		if (  binsearch(stopwords, 0, stopwords.size(), str.at(i))  ) {
			str.at(i)="";
		}
	}
}
Last edited on
Note: std::binary_search() requires that the sequence is sorted.

Using the standard library to the full:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#include <vector>
#include <string>
#include <fstream>
#include <iterator>
#include <unordered_set>
#include <algorithm>

// Removes from `words`, in place, every entry that is shorter than two
// characters or that is listed in "stopwords.txt". Remaining words keep
// their relative order.
//   words - the word list to filter
// The stop-word file is read once, on the first call, as whitespace-separated
// tokens. If it cannot be opened the stop-word set is empty and only the
// short entries are removed.
void strip_stop_words_and_compact( std::vector<std::string>& words )
{
    // Built inside an initializer lambda so the stream and its iterators are
    // destroyed (and the file closed) as soon as the set has been filled,
    // instead of living as statics for the rest of the program.
    static const std::unordered_set<std::string> stopwords = []
    {
        std::ifstream sw_file( "stopwords.txt" ) ;
        const std::istream_iterator<std::string> begin(sw_file), end ;
        return std::unordered_set<std::string>( begin, end ) ;
    }() ;

    // `stopwords` has static storage duration, so the lambda can use it without a capture.
    const auto empty_or_stopword = []( const std::string& word )
            { return ( word.size() < 2U ) || ( stopwords.find(word) != stopwords.end() ) ; } ;

    words.erase( std::remove_if( words.begin(), words.end(), empty_or_stopword ), words.end() ) ;
}

Topic archived. No new replies allowed.