Tree Compression with Top Trees Revisited
repair.cpp
Go to the documentation of this file.
1 #include <iostream>
2 #include <string>
3 
4 // Data Structures
5 #include "Edges.h"
6 #include "Nodes.h"
7 #include "OrderedTree.h"
8 
9 // Algorithms
10 #include "RePair/Coder.h"
11 #include "RePair/Prepair.h"
12 #include "RePair/RePair.h"
13 
14 // Utils
15 #include "ArgParser.h"
16 #include "BPString.h"
17 #include "Timer.h"
18 #include "XML.h"
19 
20 
21 using std::cout;
22 using std::endl;
23 using std::string;
24 
25 template <typename InType, typename DataType>
26 long long compress(vector<InType> &data, const std::string &description, const bool skipPrepair = false, const bool verbose = false) {
27  Timer timer;
28  cout << "RePair-ing the " << description;
29 
30 
31  std::unordered_map<InType, InType> inputTransformations;
32  if (!skipPrepair) {
33  cout << ", preparing… " << flush;
34  RePair::Prepair<InType>::prepare(data, inputTransformations);
35  cout << timer.getAndReset() << "ms";
36  }
37 
38  cout << ", initialising… " << flush;
39  std::vector<DataType> output;
41  cout << timer.getAndReset() << "ms, compressing… " << flush;
42 
43  repair.compress(output);
44  RePair::Dictionary<DataType> &dictionary = repair.getDictionary();
45  cout << "done (" << timer.getAndReset() << "ms)" << endl;
46 
47  cout << "Compressed representation has " << output.size() << " symbols, dictionary has " << dictionary.size() << " entries (" << dictionary.numSymbols() << " symbols)" << endl;
48 
49  if (verbose) {
50  for (auto elem : output) {
51  std::cout << elem << " ";
52  }
53  std::cout << std::endl << dictionary;
54  }
55 
56  RePair::Coder<DataType> coder(output, dictionary);
57  if (!skipPrepair) {
58  coder.codeInputMapping(inputTransformations);
59  }
60  coder.compute();
61  cout << coder.huff << " + " << coder.huff.getBitsForTableLabels() << " bits = " << (coder.getBitsNeeded() + 7) / 8 << " Bytes" << endl;
62  return coder.getBitsNeeded();
63 }
64 
65 int main(int argc, char **argv) {
66  ArgParser argParser(argc, argv);
67  string filename = "data/1998statistics.xml";
68  if (argParser.numDataArgs() > 0) {
69  filename = argParser.getDataArg(0);
70  }
71  const bool verbose = argParser.isSet("v");
72 
74  Labels<string> labels;
75  XmlParser<OrderedTree<TreeNode, TreeEdge>>::parse(filename, tree, labels);
76  cout << tree.summary() << "; Height: " << tree.height() << " Avg depth: " << tree.avgDepth() << endl;
77 
78  Timer timer;
79  std::vector<unsigned char> labelnames;
80  std::vector<bool> bpstring;
81  BPString::template fromTree<TreeNode, TreeEdge, string>(tree, labels, bpstring, labelnames);
82 
83  cout << "bpstring with " << bpstring.size() << " bits, " << labelnames.size() << " bytes of labels (transformation took " << timer.getAndReset() << "ms)" << endl;
84 
85  long long totalSize(0);
86  totalSize += compress<bool, int>(bpstring, "tree structure", false, verbose);
87  totalSize += compress<unsigned char, int>(labelnames, "labels", verbose);
88  cout << "Output file needs " << totalSize << " bits (" << (totalSize + 7)/8 << " Bytes)" << endl;
89 
90  cout << "RESULT"
91  << " file=" << filename
92  << " compressed=" << totalSize
93  << " bpstringbits=" << bpstring.size()
94  << " labelstringits=" << labelnames.size() * 8
95  << endl;
96  return 0;
97 }
int height() const
Definition: OrderedTree.h:571
long long compress(vector< InType > &data, const std::string &description, const bool skipPrepair=false, const bool verbose=false)
Definition: repair.cpp:26
void compute()
Definition: Coder.h:16
Ordered tree data structure.
Definition: OrderedTree.h:37
string getDataArg(const int index) const
get a data argument by its index (among the data arguments)
Definition: ArgParser.h:68
void codeInputMapping(std::unordered_map< InputType, InputType > &mapping)
Definition: Coder.h:35
double avgDepth() const
Definition: OrderedTree.h:579
bool isSet(const string &arg) const
check whether an argument was set
Definition: ArgParser.h:58
static void prepare(std::vector< InType > &vec, std::unordered_map< InType, InType > &transformations)
Definition: Prepair.h:16
uint numDataArgs() const
the number of unnamed data arguments
Definition: ArgParser.h:63
long long getBitsNeeded() const
Definition: Coder.h:45
void compress(std::vector< DataType > &out)
Definition: RePair.h:28
Read an XML file into a tree, using RapidXml.
Definition: XML.h:23
Parse command-line arguments.
Definition: ArgParser.h:21
HuffmanBuilder< DataType > huff
Definition: Coder.h:53
encode RePair output
Definition: Coder.h:13
Main RePair compression algorithm.
Definition: RePair.h:16
DataType numSymbols() const
Definition: Dictionary.h:34
Dictionary< DataType > & getDictionary()
Definition: RePair.h:53
string summary() const
A one-line summary of the tree.
Definition: OrderedTree.h:348
A flexible timer.
Definition: Timer.h:14
int main(int argc, char **argv)
Definition: repair.cpp:65
ReturnType getAndReset()
Definition: Timer.h:29
long long getBitsForTableLabels() const
Definition: Huffman.h:137
A key-value label storage.
Definition: Labels.h:107
uint size() const
Definition: Dictionary.h:30