// Counts occurrences of three-byte sequences (trigrams) read from the files
// under a directory tree, then prints a sample of the resulting table.
#include <iostream>
#include <fstream>
#include <tuple>
#include <unordered_map>
#include <algorithm>
#include <boost/filesystem.hpp>
// Smallest unit the trigram table works with.
using byte = unsigned char ;

// Three bytes in the order they were stored.
using trigram = std::tuple<byte,byte,byte> ;

// Hash functor that lets a trigram be used as an unordered_map key.
// The three bytes are packed into one 24-bit unsigned value
// (first byte in bits 16..23, second in bits 8..15, third in bits 0..7)
// and that value is passed to std::hash<unsigned int>.
struct hash_trigram
{
    std::size_t operator() ( trigram t ) const
    {
        // Widen each byte before shifting so no bits are lost.
        const unsigned int high = std::get<0>( t ) ;
        const unsigned int mid  = std::get<1>( t ) ;
        const unsigned int low  = std::get<2>( t ) ;

        const unsigned int packed = ( high << 16 ) + ( mid << 8 ) + low ;
        return std::hash<unsigned int>{}( packed ) ;
    }
};
// Table keyed by trigram (hashed with hash_trigram) holding an int per key;
// populate() increments that int each time the trigram is encountered.
using vocab_map_t = std::unordered_map< trigram, int, hash_trigram > ;
// Reads stm to its end and, for every run of three consecutive bytes,
// increments the matching counter in ht (inserting it at zero first if absent).
//
// ht  : table to update; existing counts are kept and added to.
// stm : input stream, consumed until a read fails. A stream holding fewer
//       than three bytes contributes nothing.
//
// Bytes are fetched with unformatted istream::get(). Formatted operator>>
// skips whitespace by default, which would silently drop bytes such as
// 0x20 or 0x0A and build trigrams across the gaps.
void populate( vocab_map_t& ht, std::istream& stm )
{
    char first ;
    char second ;
    if( !( stm.get( first ) && stm.get( second ) ) ) return ; // need two bytes to prime the window

    // Sliding window; slot 0 is a placeholder that is shifted out on the first iteration.
    byte tg[3] = { 0, static_cast<byte>( first ), static_cast<byte>( second ) } ;

    char next ;
    while( stm.get( next ) )
    {
        // shift left by one and append the byte just read
        tg[0] = tg[1] ;
        tg[1] = tg[2] ;
        tg[2] = static_cast<byte>( next ) ;

        ++ht[ std::make_tuple( tg[0], tg[1], tg[2] ) ] ; // insert or increment count
    }
}
// Adds the trigram counts found at path p to vocab_map.
//
// - p does not exist          : nothing happens.
// - p is a directory          : every entry in it is processed by a recursive call.
// - p is a regular file       : the file is opened in binary mode and handed to
//                               the stream overload of populate().
// - anything else             : ignored.
void populate( vocab_map_t& vocab_map, const boost::filesystem::path& p )
{
    namespace fs = boost::filesystem ;

    if( !fs::exists( p ) ) return ;

    if( fs::is_directory( p ) )
    {
        const fs::directory_iterator last ; // default-constructed iterator marks the end
        for( fs::directory_iterator entry( p ) ; entry != last ; ++entry )
        {
            populate( vocab_map, entry->path() ) ;
        }
        return ;
    }

    if( fs::is_regular_file( p ) )
    {
        std::ifstream input( p.string(), std::ios::binary ) ;
        populate( vocab_map, input ) ;
    }
}
// Builds the trigram table from the current directory (".") and prints its
// size followed by at most ten entries, in the map's iteration order.
int main()
{
    vocab_map_t vocab_map ;
    const boost::filesystem::path test_path = "." ;
    populate( vocab_map, test_path ) ;

    std::cout << "vocab_map contains " << vocab_map.size() << " entries.\n"
              << "the first ten are:\n" ;

    // Print entries until ten have been shown or the table is exhausted.
    int shown = 0 ;
    for( auto it = vocab_map.begin() ; it != vocab_map.end() && shown < 10 ; ++it, ++shown )
    {
        // Widen to int so the bytes print as numbers rather than characters.
        const int first  = std::get<0>( it->first ) ;
        const int second = std::get<1>( it->first ) ;
        const int third  = std::get<2>( it->first ) ;

        std::cout << " (" << first << ',' << second << ',' << third << "): "
                  << it->second << '\n' ;
    }
}
|