Code review for LZW version 3

Hello. I'm working on an article about LZW compression.

This is the source code of the third program, as I want to take the reader through pseudocode, naive implementation, hash table implementation, and then this, progressively optimizing the program and explaining why and how. Final version was supposed to be a Boost'ed serious utility, but this all proves much harder than I thought, so I probably won't go that far.

Let me know what bugs you see, or missed optimization opportunities, or whatever you don't feel belongs or is dubious. Thanks.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
///
/// @file
/// @author Julius Pettersson
/// @copyright MIT/Expat License.
/// @brief LZW file archiver
/// @version 3
///
/// This is the C++11 implementation of a Lempel-Ziv-Welch single-file command-line archiver.
/// It uses the simpler fixed-width code compression method.
/// It was written with Doxygen comments.
///
/// http://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch
/// http://marknelson.us/2011/11/08/lzw-revisited/
/// http://www.cs.duke.edu/csed/curious/compression/lzw.html
/// http://warp.povusers.org/EfficientLZW/index.html
/// http://en.cppreference.com/
/// http://www.doxygen.org/
///

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <fstream>
#include <ios>
#include <iostream>
#include <istream>
#include <limits>
#include <map>
#include <memory>
#include <ostream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace {

typedef std::uint16_t CodeType;   ///< Type used to store and retrieve codes.

namespace globals {

/// Dictionary Maximum Size (when reached, the dictionary will be reset)
const CodeType dms = std::numeric_limits<CodeType>::max();

} // namespace globals

///
/// @brief Compresses the contents of `is` and writes the result to `os`.
/// @param [in] is      input stream
/// @param [out] os     output stream
///
void compress(std::istream &is, std::ostream &os)
{
    std::map<std::pair<CodeType, char>, CodeType> dictionary;
    //std::map<std::vector<char>, CodeType> dictionary;

    // "named" lambda function, used to reset the dictionary to its initial contents
    const auto reset_dictionary = [&dictionary] {
        dictionary.clear();

        const int minc = std::numeric_limits<char>::min();
        const int maxc = std::numeric_limits<char>::max();

        for (int c = minc; c <= maxc; ++c)
            dictionary.insert({{globals::dms, static_cast<char> (c)}, dictionary.size()});
    };

    reset_dictionary();

    CodeType i{globals::dms}; // Index
    char c;

    while (is.get(c))
    {
        // dictionary's maximum size was reached
        if (dictionary.size() == globals::dms)
            reset_dictionary();

        if (dictionary.count({i, c}) == 0)
        {
            dictionary.insert({{i, c}, dictionary.size()});
            os.write(reinterpret_cast<const char *> (&i), sizeof (CodeType));
            i = dictionary.at({globals::dms, c});
        }
        else
            i = dictionary.at({i, c});
    }

    if (i != globals::dms)
        os.write(reinterpret_cast<const char *> (&i), sizeof (CodeType));
}

///
/// @brief Decompresses the contents of `is` and writes the result to `os`.
/// @param [in] is      input stream
/// @param [out] os     output stream
///
void decompress(std::istream &is, std::ostream &os)
{
    std::vector<std::pair<CodeType, char>> dictionary;

    // "named" lambda function, used to reset the dictionary to its initial contents
    const auto reset_dictionary = [&dictionary] {
        dictionary.clear();
        dictionary.reserve(globals::dms);

        const int minc = std::numeric_limits<char>::min();
        const int maxc = std::numeric_limits<char>::max();

        for (int c = minc; c <= maxc; ++c)
            dictionary.push_back({globals::dms, static_cast<char> (c)});
    };

    // "named" lambda function, used to rebuild strings
    const auto rebuild_string = [&dictionary](CodeType k) -> std::vector<char> {
        std::vector<char> s; // String

        do
        {
            s.push_back(dictionary.at(k).second);
            k = dictionary.at(k).first;
        }
        while (k != globals::dms);

        std::reverse(s.begin(), s.end());
        return s;
    };

    reset_dictionary();

    CodeType i{globals::dms}; // Index
    CodeType k; // Key
    std::vector<char> s; // String

    while (is.read(reinterpret_cast<char *> (&k), sizeof (CodeType)))
    {
        // dictionary's maximum size was reached
        if (dictionary.size() == globals::dms)
            reset_dictionary();

        if (k > dictionary.size())
            throw std::runtime_error("invalid compressed code");

        if (k == dictionary.size())
            dictionary.push_back({i, rebuild_string(i).front()});
        else
        if (i != globals::dms)
            dictionary.push_back({i, rebuild_string(k).front()});

        s = rebuild_string(k);
        os.write(&s.front(), s.size());
        i = k;
    }

    if (!is.eof() || is.gcount() != 0)
        throw std::runtime_error("corrupted compressed file");
}

///
/// @brief Prints usage information and a custom error message.
/// @param s    custom error message to be printed
/// @param su   Show Usage information
///
void print_usage(const std::string &s = "", bool su = true)
{
    if (!s.empty())
        std::cerr << "\nERROR: " << s << '\n';

    if (su)
    {
        std::cerr << "\nUsage:\n";
        std::cerr << "\tprogram -flag input_file output_file\n\n";
        std::cerr << "Where `flag' is either `c' for compressing, or `d' for decompressing, and\n";
        std::cerr << "`input_file' and `output_file' are distinct files.\n\n";
        std::cerr << "Examples:\n";
        std::cerr << "\tlzw.exe -c license.txt license.lzw\n";
        std::cerr << "\tlzw.exe -d license.lzw new_license.txt\n";
    }

    std::cerr << std::endl;
}

} // namespace

///
/// @brief Actual program entry point.
/// @param argc             number of command line arguments
/// @param [in] argv        array of command line arguments
/// @retval EXIT_FAILURE    for failed operation
/// @retval EXIT_SUCCESS    for successful operation
///
int main(int argc, char *argv[])
{
    if (argc != 4)
    {
        print_usage("Wrong number of arguments.");
        return EXIT_FAILURE;
    }

    enum class Mode {
        Compress,
        Decompress
    };

    Mode m;

    if (std::string(argv[1]) == "-c")
        m = Mode::Compress;
    else
    if (std::string(argv[1]) == "-d")
        m = Mode::Decompress;
    else
    {
        print_usage(std::string("flag `") + argv[1] + "' is not recognized.");
        return EXIT_FAILURE;
    }

    const std::size_t buffer_size{1024 * 1024};

    // these custom buffers should be larger than the default ones
    const std::unique_ptr<char[]> input_buffer(new char[buffer_size]);
    const std::unique_ptr<char[]> output_buffer(new char[buffer_size]);

    std::ifstream input_file;
    std::ofstream output_file;

    input_file.rdbuf()->pubsetbuf(input_buffer.get(), buffer_size);
    input_file.open(argv[2], std::ios_base::binary);

    if (!input_file.is_open())
    {
        print_usage(std::string("input_file `") + argv[2] + "' could not be opened.");
        return EXIT_FAILURE;
    }

    output_file.rdbuf()->pubsetbuf(output_buffer.get(), buffer_size);
    output_file.open(argv[3], std::ios_base::binary);

    if (!output_file.is_open())
    {
        print_usage(std::string("output_file `") + argv[3] + "' could not be opened.");
        return EXIT_FAILURE;
    }

    try
    {
        input_file.exceptions(std::ios_base::badbit);
        output_file.exceptions(std::ios_base::badbit | std::ios_base::failbit);

        if (m == Mode::Compress)
            compress(input_file, output_file);
        else
        if (m == Mode::Decompress)
            decompress(input_file, output_file);
    }
    catch (const std::ios_base::failure &f)
    {
        print_usage(std::string("File input/output failure: ") + f.what() + '.', false);
        return EXIT_FAILURE;
    }
    catch (const std::exception &e)
    {
        print_usage(std::string("Caught exception: ") + e.what() + '.', false);
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Topic archived. No new replies allowed.