Another word checker

closed account (48bpfSEw)
It can take a long time to check all anknown words in MS-Words especially the text is mixed with german, english and specific economy words.

I wrote a little solution in Microsoft Visual Community 2015

First you need two files.

File "Known.txt" contains all the known words :

And
Are
But
By
Disaster
Earth
Except
Hold
IF
If
Kings
Man
Or
To
Triumph
etc.


File Words.txt contains the text you want to check

If you can keep your head when all about you
Are losing theirs and blaming it on you,
If you can trust yourself when all men doubt you,
But make allowance for their doubting too;


And next you need the code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include "stdafx.h"

#include <iostream>
#include <strstream>
#include <iomanip>
#include <sstream>
#include <string>
#include <fstream>
#include <streambuf>
#include <vector>
#include <map>
#include <list>
#include <algorithm>

typedef std::map<int, std::vector<char*>>		MapWords;
typedef MapWords::iterator						ItWords;

void prepare(MapWords &map, char*ptr) {

	int ic = 0;
	char* ptrFirst = ptr;
	for (char* ptr = ptrFirst; *ptr; ++ptr) {
		char& c = *ptr;
		// if (! isalpha(c)) {
		if (!((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
			if (ic > 0) {
				map[ic].push_back(ptrFirst);
				
				for (++ptr; *ptr; ++ptr) {
					char& c = *ptr;
					// if (isalpha(c)) {
					if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
						ptrFirst = ptr;
						break;
					}
				}
			}
			ic = 0;
		}
		ic++;
	}
}

std::string getn(char* ptr, int iSize) {
	std::ostringstream	os;

	for (int i = 0; i < iSize; i++)
		os << *(ptr + i);

	return os.str();
}

void outn(char* ptr, int iSize) {
	for (int i = 0; i < iSize; i++)
		std::cout << *(ptr + i);
}

bool cmpn(char* ptr1, char* ptr2, int iSize) {
	for (int i = 0; i < iSize; i++)
		if (*(ptr1 + i) != *(ptr2 + i))
			return false;

	return true;
}

void output(MapWords &map, char*ptr) {
	std::cout << "map: " << ptr << std::endl;

	for (ItWords it1 = map.begin(); it1 != map.end(); ++it1) {
		std::cout << "words with size: " << it1->first << std::endl;
		for (std::vector<char*>::iterator it2 = it1->second.begin(); it2 != it1->second.end(); ++it2) {
			std::cout << "\t";
			outn(*it2, it1->first);
			std::cout << std::endl;
		}
	}
}

void compare(MapWords &mapKnown, MapWords &mapWords) {

	std::list<std::string> lstUsed;

	std::cout << "[comparing] List of not found words:" << std::endl;

	for (ItWords itWords = mapWords.begin(); itWords != mapWords.end(); ++itWords) {

		ItWords itKnown = mapKnown.find(itWords->first);
		if (itKnown == mapKnown.end()) {
			std::cout << "size " << itWords->first << " not found!" << std::endl;


			for (std::vector<char*>::iterator it2 = itWords->second.begin(); it2 != itWords->second.end(); ++it2) {

				std::string strWord = getn(*it2, itWords->first);
				std::list<std::string>::iterator itUsed = find(lstUsed.begin(), lstUsed.end(), strWord);
				if (itUsed != lstUsed.end())
					continue;
				lstUsed.push_back(strWord);


				std::cout << "\t";
				outn(*it2, itWords->first);
				std::cout << std::endl;
			}

			continue;
		}



		for (std::vector<char*>::iterator itWordsSec = itWords->second.begin(); itWordsSec != itWords->second.end(); ++itWordsSec) {

			bool boFound = false;
			for (std::vector<char*>::iterator itKnownSec = mapKnown[itWords->first].begin(); itKnownSec != mapKnown[itWords->first].end(); ++itKnownSec) {
				if (cmpn(*itWordsSec, *itKnownSec, itWords->first)) {
					boFound = true;
					break;
				}
			}

			if (!boFound) {
				std::cout << "\t";
				outn(*itWordsSec, itWords->first);
				std::cout << std::endl;
			}
		}
	}
}

int main(void) {

	std::ifstream tknown("known.txt");
	std::string strKnown((std::istreambuf_iterator<char>(tknown)),
		std::istreambuf_iterator<char>());

	std::ifstream twords("words.txt");
	std::string strWords((std::istreambuf_iterator<char>(twords)),
		std::istreambuf_iterator<char>());

	// remove all \r from string
	strKnown.erase(std::remove(strKnown.begin(), strKnown.end(), '\r'), strKnown.end());
	strWords.erase(std::remove(strWords.begin(), strWords.end(), '\r'), strWords.end());

	char* ptrKnown = (char*)strKnown.c_str();
	char* ptrWords = (char*)strWords.c_str();

	MapWords mapKnown;
	MapWords mapWords;

	prepare(mapKnown, ptrKnown);
	prepare(mapWords, ptrWords);

	// output(mapKnown, "Known");
	// output(mapWords, "Words");

	compare(mapKnown, mapWords);

	std::cout << "Press any key!";
	system("pause");

	return 0;
}



The Result:
t
t
t
s
s
em
ll
Rudyard
Kipling


Source in Folder "WordCheck"

https://www.dropbox.com/sh/idqo76tmx138sqs/AABszz90JjH3x88RY8zzK3sza?dl=0
Last edited on
closed account (48bpfSEw)
may be someone can find out why I get an exception when I use isalpha
* prepare() may try to access past the null terminator. The inner for will leave ptr at the null terminator, then the outer for will increment the pointer, moving it past the end of the string, then attempt to dereference it. At this point the behavior of the program becomes undefined.
* You're casting away the constness of the pointer returned by std::string::c_str(). This is dangerous. Only use const char * to store these pointers.
The Result:
t
t
t
s
s
em
ll
Rudyard
Kipling
¿what the hell does that mean?
closed account (48bpfSEw)
@helios, thank you... I'm still debugging the errors in prepare(). It seems a little bit tricky.


@ne555, the code splits currently only alpha characters.

"don’t" for example is splitted in "don" and "t"

"don" is defined as known word in the file known.txt
but "t" is not defined in it.

additionaly the code is not optimized for redundance.

"don't" appears in the file three times.
closed account (48bpfSEw)
ok, I found the bug:

1
2
3
	
char c = '’';
isalpha(c);


throws this exception:

Debug Assertion Failed!
Programm: .... File... isctype.cpp
Line 42

Expression: c >= -1 && c <= 255



Workarround: deleting all these characters before preparation(): ’—‘


You'll find the correct source code on my dropbox.
Last edited on
Topic archived. No new replies allowed.