Most Common Word in a Document

Pages: 12
The only problem is that if there is more than one max item ("fine" and "you" in this case) I have to state both of them.



Do what both @keskiverto and @repeater suggested:
1. Find the maximum count.
2. Print all words that have that count.
If you can't store the most common words as you go, then go round twice. The first time round, you establish the value of MostCommon.

The second time round, output every word whose count value equals that value.



Are you absolutely certain you can't use a map<string,int> to hold word and frequency?
Failing that, bodge your own with a struct (which is C, hence allowable!) having string and int data members.
Last edited on
> if there is more than one max item ("fine" and "you" in this case) I have to state both of them.

Repeat of what keskiverto told you long ago here http://www.cplusplus.com/forum/beginner/220706/#msg1014879
Make two passes through the array:
the first pass to determine the maximum frequency count,
and then a second pass to print all the words with that frequency count.

For example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#include <iostream>
#include <string>
#include <fstream>
#include <cctype>
#include <sstream>
#include <iomanip>
// in in in in string

// Return a copy of line in which every alphabetic character is converted to
// lower case and every other character is replaced with a space, so that a
// whitespace-delimited read of the result yields only lower-case words.
std::string format( std::string line )
{
    std::string result ;
    result.reserve( line.size() ) ;

    for( const char c : line )
    {
        // the <cctype> functions are undefined for negative values (other than EOF),
        // which a plain char may hold; convert to unsigned char before calling them
        const unsigned char uc = static_cast<unsigned char>(c) ;

        if( std::isalpha(uc) ) result += static_cast<char>( std::tolower(uc) ) ;
        else result += ' ' ;
    }

    return result ;
}

// Read words from stm, one line at a time, passing every line through format()
// before splitting it on whitespace. Words are stored in words_array in the
// order in which they are read; reading stops as soon as array_sz words have
// been stored or the stream has nothing more to give.
// Returns the number of words that were stored.
int get_words( std::istream& stm, std::string words_array[], int array_sz )
{
    int filled = 0 ; // slots of words_array used so far

    for( std::string text ; filled < array_sz && std::getline( stm, text ) ; )
    {
        // split the cleaned-up line into words
        std::istringstream tokens( format(text) ) ;

        for( std::string token ; filled < array_sz && tokens >> token ; ++filled )
            words_array[filled] = token ;
    }

    return filled ;
}

// Return the position of the smallest element among the first num_words
// elements of words. When several elements compare equal to the smallest, the
// position of the first one is returned. Returns 0 if num_words is less than 2.
int pos_smallest_element( const std::string words[], int num_words )
{
    int best = 0 ;
    int candidate = 1 ;

    while( candidate < num_words )
    {
        if( words[candidate] < words[best] ) best = candidate ;
        ++candidate ;
    }

    return best ;
}

// Sort the first num_words elements of words into ascending order (selection
// sort): for each position in turn, locate the smallest element of the part
// that is still unsorted and swap it into that position.
void sort( std::string words[], int num_words )
{
    for( int first = 0 ; num_words - first > 1 ; ++first )
    {
        // offset of the smallest element, relative to the start of the unsorted part
        const int offset = pos_smallest_element( words + first, num_words - first ) ;
        std::swap( words[first], words[first + offset] ) ;
    }
}

// Return the number of times the most frequent word occurs among the first
// num_words elements of words, or 0 when there are no words at all.
// Side effect: the array is sorted into ascending order, so that equal words
// end up next to each other and every run can be measured in a single pass.
int max_frequency( std::string words[], int num_words )
{
    if( num_words <= 0 ) return 0 ; // nothing to count

    sort( words, num_words ) ;

    int longest_run = 1 ; // length of the longest run of equal words seen so far
    int current_run = 1 ; // length of the run that ends at words[i]
    for( int i = 1 ; i < num_words ; ++i )
    {
        if( words[i] == words[i-1] ) ++current_run ;
        else current_run = 1 ;

        if( current_run > longest_run ) longest_run = current_run ;
    }

    return longest_run ;
}

// Print, one per line, every word that occurs exactly freq times among the
// first num_words elements of words. Equal words are expected to be adjacent
// (for example because the array has been sorted): occurrences are counted as
// runs of consecutive equal elements. Prints nothing when there are no words.
void print_words_with_frequency( const std::string words[], int num_words, int freq )
{
    // without this guard the final check below would read words[-1]
    if( num_words <= 0 ) return ;

    int run_length = 1 ;
    for( int i = 1 ; i < num_words ; ++i )
    {
        if( words[i] == words[i-1] ) ++run_length ;
        else
        {
            // words[i-1] was the last element of the run that has just ended
            if( run_length == freq ) std::cout << words[i-1] << '\n' ;
            run_length = 1 ;
        }
    }

    // the final run is not followed by a different word, so report it here
    if( run_length == freq ) std::cout << words[num_words-1] << '\n' ;
}

// Read up to MAX words from a file and print the words that occur most often.
// By default the input is this very source file, which means that comment text
// (including the "for testing" lines) is part of the input data.
int main()
{
// for testing: zebra zebra zebra zebra zebra zebra zebra zebra zebra zebra (10)
// FOR TESTING: ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA (20)

    const int MAX = 1000 ; // maximum number of words that we can read
    std::string words[MAX] ;

    std::ifstream file( __FILE__ ) ; // this file: modify as required. eg.
                                     // std::ifstream file( "3.txt" ) ;
// for testing: ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant (20)
// FOR TESTING: ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA (30)
// FOR TESTING: ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT (40)
// for testing: zebra zebra zebra zebra zebra zebra zebra zebra zebra zebra (40)
// FOR TESTING: ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA (50)
// for testing: ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant ant (60)

    // report a missing or unreadable file instead of carrying on with no words
    if( !file.is_open() )
    {
        std::cerr << "could not open the input file\n" ;
        return 1 ;
    }

    const int nwords_read = get_words( file, words, MAX ) ;

    // first pass: highest frequency; second pass: every word with that frequency
    const int max_freq = max_frequency( words, nwords_read ) ;
    std::cout << "these words occurred the most number of times (" << max_freq << "):\n" ;
    print_words_with_frequency( words, nwords_read, max_freq ) ;
}

// for testing: zebra zebra zebra zebra zebra zebra zebra zebra zebra zebra (60)
// FOR TESTING: ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA ZEBRA (70)
// FOR TESTING: ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT ANT (80)
// for testing: zebra zebra zebra zebra zebra zebra zebra zebra zebra zebra (80) 

http://coliru.stacked-crooked.com/a/2197b95d057bc826
I finally got it to work.

I really appreciate everyone's help on this. Thank you all so much.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#include <iostream>
#include <fstream>
#include <cctype>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
using namespace std;


// Prototypes
vector <string> readFile();
void filter( vector<string> &words );
template <typename T> void printAll( map<T,int> myMap );
template <typename T> void printMax( map<T,int> myMap );
string toLower( string text );
string removeEndPunctuation( string text, bool &split );
vector <string> splitWord( string text );


//======================================================================


// Read the words of the input file, clean them up, count how often each one
// occurs, then print the complete frequency table followed by the most common
// word(s).
int main()
{
   std::vector<std::string> words = readFile();
   filter( words );

   // Frequency table: word -> number of occurrences
   std::map<std::string,int> freqMap;
   for ( const std::string &w : words ) ++freqMap[w];

   printAll<std::string>( freqMap );

   std::cout << '\n';
   printMax<std::string>( freqMap );
}


//======================================================================


// Read the "raw" words from the file "text.dat": a word is any run of
// characters delimited by white space. Returns the words in file order; the
// result is empty if the file cannot be opened.
std::vector<std::string> readFile()
{
   std::ifstream source( "text.dat" );        // Closed automatically when it goes out of scope

   std::vector<std::string> tokens;
   for ( std::string token; source >> token; ) tokens.push_back( token );

   return tokens;
}


//======================================================================


// Apply the word filters, in place, to every element of words:
//   1. convert to lower case (for consistency);
//   2. strip punctuation from both ends (not from the middle: "it's" is not "its");
//   3. drop anything that is left empty;
//   4. replace anything flagged as splittable by the pieces returned by splitWord().
// The surviving words keep their original relative order.
void filter( std::vector<std::string> &words )
{
   // Build the result in a second vector: a single linear pass, instead of
   // repeatedly erasing from / inserting into the middle of the vector being scanned
   std::vector<std::string> kept;
   kept.reserve( words.size() );

   for ( const std::string &raw : words )
   {
      bool split = false;                                  // Set by removeEndPunctuation()
      const std::string cleaned = removeEndPunctuation( toLower( raw ), split );

      if ( cleaned.empty() ) continue;                     // Nothing left, so drop it

      if ( split )                                         // Can be further split
      {
         const std::vector<std::string> pieces = splitWord( cleaned );
         kept.insert( kept.end(), pieces.begin(), pieces.end() );
      }
      else
      {
         kept.push_back( cleaned );
      }
   }

   words.swap( kept );
}


//======================================================================


// Print every entry of the frequency table, one per line, as "key  count"
// (two spaces between them), in the map's key order.
template <typename T> void printAll( std::map<T,int> myMap )
{
   for ( const auto &entry : myMap )
   {
      std::cout << entry.first << "  " << entry.second << '\n';
   }
}


//======================================================================


// Print the highest count found in the frequency table, followed by every key
// that has that count (one per line, in the map's key order).
// For an empty table the reported maximum is 0 and no keys are listed.
template <typename T> void printMax( std::map<T,int> myMap )
{
   // Pass 1: establish the highest count
   int peak = 0;
   for ( const auto &entry : myMap ) peak = std::max( peak, entry.second );
   std::cout << "Maximum frequency: " << peak << '\n';

   // Pass 2: list every key that reaches it
   std::cout << "Most-common words:\n";
   for ( const auto &entry : myMap )
   {
      if ( entry.second == peak ) std::cout << entry.first << '\n';
   }
}


//======================================================================


// Returns a copy of text with every character converted to lower case.
std::string toLower( std::string text )
{
   for ( char &c : text )
   {
      // tolower() is undefined for negative values (other than EOF), which a plain
      // char may hold, so convert to unsigned char before calling it
      c = static_cast<char>( std::tolower( static_cast<unsigned char>( c ) ) );
   }
   return text;
}


//======================================================================


// Returns text with any punctuation removed from both ends (only the ends; the
// middle is left untouched). Returns an empty string if text is empty or
// consists of punctuation only.
// On return, split is true if the result still contains a character that is
// neither a letter nor an apostrophe, i.e. if it could be split further by
// splitWord(); it is false otherwise.
std::string removeEndPunctuation( std::string text, bool &split )
{
   // The <cctype> functions are undefined for negative values (other than EOF),
   // which a plain char may hold, so convert to unsigned char before calling them
   const auto isPunct = []( char c ) { return std::ispunct( static_cast<unsigned char>( c ) ) != 0; };
   const auto isAlpha = []( char c ) { return std::isalpha( static_cast<unsigned char>( c ) ) != 0; };

   split = false;

   // First character that is not punctuation
   std::string::size_type first = 0;
   while ( first < text.size() && isPunct( text[first] ) ) ++first;
   if ( first == text.size() ) return "";                  // Empty, or punctuation only

   // Last character that is not punctuation (text[first] guarantees that there is one)
   std::string::size_type last = text.size() - 1;
   while ( last > first && isPunct( text[last] ) ) --last;

   std::string result = text.substr( first, last - first + 1 );

   // Check whether it can be split at anything other than a letter or internal apostrophe
   for ( char c : result )
   {
      if ( !isAlpha( c ) && c != '\'' )
      {
         split = true;
         break;
      }
   }

   return result;
}


//======================================================================


// Splits text at every character that is neither a letter nor an apostrophe
// (digits, other punctuation, ...). Returns the non-empty pieces in order.
std::vector<std::string> splitWord( std::string text )
{
   std::vector<std::string> result;
   std::string currentWord;

   for ( char c : text )
   {
      // isalpha() is undefined for negative values (other than EOF), which a plain
      // char may hold, so convert to unsigned char before calling it
      if ( std::isalpha( static_cast<unsigned char>( c ) ) || c == '\'' )
      {
         currentWord += c;                                 // Letter or apostrophe: still the same word
      }
      else
      {
         if ( !currentWord.empty() ) result.push_back( currentWord );   // Separator: finish the current word ...
         currentWord.clear();                                           // ... and start a new one afresh
      }
   }
   if ( !currentWord.empty() ) result.push_back( currentWord );         // Word that runs up to the end of text

   return result;
}


Not sure about abolishing numbers though. British workers would find a "P" rather different to a "P45"!

As run through the source code:
a  3
above  1
afresh  1
algorithm  1
alphabet  1
also  1
and  1
any  4
anything  2
apostrophe  2
apostrophes  1
apply  2
array  1
as  1
at  2
auto  3
back  3
be  4
begin  4
bool  3
both  1
by  2
c  10
can  2
careful  1
case  2
cctype  1
change  1
char  3
check  2
close  1
common  1
consistency  1
cout  5
create  1
currentword  7
dat  1
determined  1
e  10
element  1
else  3
empty  3
end  1
ends  2
erase  2
false  1
file  2
filter  3
filters  2
first  2
for  9
freqmap  4
frequency  2
from  4
fstream  1
further  2
g  1
go  1
grammatically  1
here  1
i  26
if  11
ifstream  1
in  4
include  7
insert  1
int  10
internal  3
iostream  1
is  1
isalpha  2
ispunct  2
it  1
it's  1
its  1
just  1
largest  1
last  4
letter  1
lower  2
main  1
map  6
maxfreq  5
maximum  1
may  1
middle  1
most  1
mymap  7
n  5
namespace  1
new  1
next  1
non  1
not  2
note  1
numbers  1
only  1
onto  1
or  4
other  2
otherwise  1
pass  2
print  2
printall  3
printmax  3
prototypes  1
punctuation  3
push  3
put  1
raw  1
read  2
readfile  3
remove  2
removeendpunctuation  3
removes  1
reports  1
residual  1
result  14
return  6
returns  1
routine  1
s  3
same  2
second  4
separated  1
should  1
size  4
so  1
space  1
split  12
splittable  1
splitword  3
start  1
std  1
still  1
string  30
strings  1
substr  1
t  8
table  2
tag  1
template  4
text  15
than  2
that  2
the  5
to  3
toinsert  4
tolower  4
true  1
turn  1
typename  4
using  1
vector  11
void  6
whether  2
while  4
white  1
whole  1
wish  1
word  6
words  22
you  2

Maximum frequency: 30
Most-common words:
string

Last edited on
If using std::map<> and std::vector<> is permitted, the program can be short and sweet:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#include <iostream>
#include <string>
#include <fstream>
#include <cctype>
#include <sstream>
#include <iomanip>
#include <map>
#include <vector>

// Return a copy of line in which every letter is converted to lower case and
// every other character is replaced by a blank.
std::string format( std::string line )
{
    std::string result ;

    for( const char c : line )
    {
        // the character classification functions are undefined for negative values,
        // which a plain char may hold; convert to unsigned char before calling them
        const unsigned char uc = static_cast<unsigned char>(c) ;

        if( std::isalpha(uc) ) result += static_cast<char>( std::tolower(uc) ) ;
        else result += ' ' ;
    }

    return result ;
}

// Read stm line by line, clean every line with format(), and tally how often
// each whitespace-delimited word occurs.
// Returns the resulting word -> count table.
std::map< std::string, std::size_t > get_words( std::istream& stm )
{
    std::map< std::string, std::size_t > tally ;

    for( std::string text ; std::getline( stm, text ) ; )
    {
        std::istringstream tokens( format(text) ) ;

        // a word seen for the first time starts from a count of zero
        for( std::string token ; tokens >> token ; ) ++tally[token] ;
    }

    return tally ;
}

// Regroup a word -> count table by count: every count is associated with the
// list of all words that occurred that many times. The words within a list are
// in ascending order, because the input table is traversed in key order.
std::map< std::size_t, std::vector<std::string> > make_frequency_map( std::map< std::string, std::size_t > words )
{
    std::map< std::size_t, std::vector<std::string> > freq_map ;

    for( const auto& entry : words )
    {
        const std::size_t count = entry.second ;

        // the key is const inside the container, so a move request would
        // silently copy it anyway: copy it explicitly
        freq_map[count].push_back( entry.first ) ;
    }

    return freq_map ;
}

// Count the words in the file named by __FILE__ (this program's own source)
// and print the ones that occur most often.
// NOTE: because the source file is the input, the repeated-word comment lines
// inside this function are test data; editing or removing them changes the output.
int main()
{
    std::ifstream file( __FILE__ ) ;
    const auto freq_map = make_frequency_map( get_words(file) ) ;

    // string string string string string string string string string string
    if( !freq_map.empty() )
    {
        // entries are ordered by ascending count, so the last one holds the highest count
        auto iter = --freq_map.end() ;
        std::cout << "these words occurred the most number of times (" << iter->first << "):\n" ;
        for( const auto& word : iter->second ) std::cout << word << '\n' ;
    }
    // map map map map map map map map map map map map map map map map map
    // string string string string string string string string string string
}

http://coliru.stacked-crooked.com/a/d70ee3f035acc842
closed account (48T7M4Gy)
Word extraction but counting not included:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#include <iostream>
#include <fstream>
#include <string>

// Extract the words from "source.txt" and print them one per line, in lower
// case. A word is a maximal run of alphabetic characters; every other
// character acts as a separator.
int main ()
{
    std::ifstream myfile ("source.txt");

    if (!myfile.is_open())
    {
        std::cout << "Unable to open file";
        return 0;
    }

    std::string word;
    char ch;

    // noskipws: white space must be seen too, because it separates words
    while ( myfile >> std::noskipws >> ch )
    {
        // isalpha/tolower are undefined for negative values (other than EOF),
        // which a plain char may hold
        const unsigned char uch = static_cast<unsigned char>(ch);

        if (isalpha(uch))
            word += static_cast<char>(tolower(uch));
        else if (!word.empty())
        {
            std::cout << word << '\n';
            word.clear();
        }
    }

    // The file may end on a letter: print the word that is still pending
    if (!word.empty())
        std::cout << word << '\n';

    return 0;
}
closed account (48T7M4Gy)
Won't scale up well but nevertheless.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#include <iostream>
#include <fstream>
#include <string>
#include <iomanip>

// Count how often each word occurs in "000 copy.txt" and print the most
// frequent word(s) together with their count. A word is a maximal run of
// alphabetic characters, converted to lower case. At most MAX_WORDS distinct
// words are tracked; any further distinct words are ignored, with a warning.
int main ()
{
    const int MAX_WORDS = 100;
    std::string word_list[MAX_WORDS];   // distinct words, in order of first appearance
    int word_count[MAX_WORDS]{0};       // word_count[i] = occurrences of word_list[i]

    int list_size = 0;                  // number of slots of word_list in use
    int highest_frequency = 0;
    bool list_full = false;             // set when a new word had to be dropped

    std::ifstream myfile ("000 copy.txt");

    if (!myfile.is_open())
    {
        std::cout << "Unable to open file";
        return 0;
    }

    // Count one occurrence of w, adding it to the list if it is not there yet
    auto record = [&](const std::string& w)
    {
        // only the slots in use can hold a match
        int index = 0;
        while (index < list_size && word_list[index] != w)
            ++index;

        // WORD NOT IN LIST
        if (index == list_size)
        {
            if (list_size == MAX_WORDS)
            {
                list_full = true;       // no room left: never write past the arrays
                return;
            }
            word_list[list_size++] = w; // its count is still zero at this point
        }

        if (++word_count[index] > highest_frequency)
            highest_frequency = word_count[index];
    };

    // READ FILE CHARACTER BY CHARACTER
    // (noskipws: white space must be seen too, because it separates words)
    std::string word;
    char ch;
    while ( myfile >> std::noskipws >> ch )
    {
        // isalpha/tolower are undefined for negative values (other than EOF),
        // which a plain char may hold
        const unsigned char uch = static_cast<unsigned char>(ch);

        if (isalpha(uch))
            word += static_cast<char>(tolower(uch));
        else if (!word.empty())
        {
            record(word);
            word.clear();
        }
    }

    // The file may end on a letter: count the word that is still pending
    if (!word.empty())
        record(word);

    if (list_full)
        std::cerr << "Warning: more than " << MAX_WORDS
                  << " distinct words; the extra words were ignored\n";

    // DISPLAY HIGHEST FREQUENCY WORDS
    std::cout << "Most frequent word(s):\n";
    for (int i = 0; i < list_size; ++i)
    {
        if (word_count[i] == highest_frequency)
            std::cout
            << std::setw(10) << word_list[i]
            << std::setw(5) << word_count[i] << '\n';
    }

    return 0;
}
How are you? H3ello S30
Fine! Thanks. And you?
I'm fine, too. H3ello/too./are,"are"


Most frequent word(s):
       are    3
Program ended with exit code: 0
Topic archived. No new replies allowed.
Pages: 12