liblevenshtein 4.0.0
A library for generating Finite State Transducers based on Levenshtein Automata.
Loading...
Searching...
No Matches
serializer.cpp
Go to the documentation of this file.
1#include <cstdint>
2#include <filesystem>
3#include <fstream>
4#include <iostream>
5#include <map>
6#include <set>
7#include <utility>
8
12#include "liblevenshtein/proto/liblevenshtein.pb.h"
14
15namespace fs = std::filesystem;
16
17namespace llp = liblevenshtein::proto;
18
19namespace liblevenshtein {
20
21auto serialize_protobuf(Dawg *dawg, const fs::path &path) -> bool {
22 return serialize_protobuf(dawg, path.generic_string());
23}
24
25auto serialize_protobuf(Dawg *dawg, const std::string &path) -> bool {
26 return serialize_protobuf(dawg, path.c_str());
27}
28
29auto serialize_protobuf(Dawg *dawg, const char *path) -> bool {
30 std::fstream output(path, std::fstream::out | std::fstream::trunc |
31 std::fstream::binary);
33 output.close();
34 return serialized;
35}
36
37auto serialize_protobuf(Dawg *dawg, std::ostream &output) -> bool {
38 llp::Dictionary *dict_proto = to_protobuf(dawg);
39 bool serialized = dict_proto->SerializeToOstream(&output);
40 delete dict_proto;
41 return serialized;
42}
43
44auto deserialize_protobuf(const fs::path &path) -> Dawg * {
45 return deserialize_protobuf(path.generic_string());
46}
47
48auto deserialize_protobuf(const std::string &path) -> Dawg * {
49 return deserialize_protobuf(path.c_str());
50}
51
52auto deserialize_protobuf(const char *path) -> Dawg * {
53 std::fstream input(path, std::fstream::in | std::fstream::binary);
55 input.close();
56 return dawg;
57}
58
59auto deserialize_protobuf(std::istream &input) -> Dawg * {
60 Dawg *dawg = nullptr;
61 llp::Dictionary dict_proto;
62 if (dict_proto.ParseFromIstream(&input)) {
64 }
65 return dawg;
66}
67
68void collect_nodes(DawgNode *source, std::set<uint64_t> &node_ids,
69 std::set<uint64_t> &final_node_ids) {
70 auto source_id = reinterpret_cast<uint64_t>(source);
71 if (!node_ids.contains(source_id)) {
72 node_ids.insert(source_id);
73 if (source->is_final()) {
75 }
76 source->for_each_edge([&](char label, DawgNode *target) {
78 });
79 }
80}
81
83 std::map<std::pair<uint64_t, char>, uint64_t> &edges) {
84 auto source_id = reinterpret_cast<uint64_t>(source);
85 source->for_each_edge([&](char label, DawgNode *target) {
86 std::pair<uint64_t, char> key = std::make_pair(source_id, label);
87 if (!edges.contains(key)) {
88 auto target_id = reinterpret_cast<uint64_t>(target);
90 collect_edges(target, edges);
91 }
92 });
93}
94
95auto to_protobuf(Dawg *dawg) -> llp::Dictionary * {
96 auto *dict_proto = new llp::Dictionary();
97 DawgNode *root = dawg->root();
98
99 std::set<uint64_t> node_ids;
100 std::set<uint64_t> final_node_ids;
102 for (const uint64_t &node_id : node_ids) {
103 dict_proto->add_node_id(node_id);
104 }
105 for (const uint64_t &final_node_id : final_node_ids) {
106 dict_proto->add_final_node_id(final_node_id);
107 }
108
109 std::map<std::pair<uint64_t, char>, uint64_t> edges;
110 collect_edges(root, edges);
111 for (const auto &edge : edges) {
112 uint64_t source_id = edge.first.first;
113 char label = edge.first.second;
114 uint64_t target_id = edge.second;
115 llp::Dictionary_Edge *edge_proto = dict_proto->add_edge();
116 edge_proto->set_source_id(source_id);
117 edge_proto->set_label(label);
118 edge_proto->set_target_id(target_id);
119 }
120
121 auto root_id = reinterpret_cast<uint64_t>(root);
122 dict_proto->set_root_id(root_id);
123 dict_proto->set_size(dawg->size());
124
125 return dict_proto;
126}
127
128auto from_protobuf(const llp::Dictionary &dict_proto) -> Dawg * {
129 std::set<uint64_t> final_node_ids;
130 for (int index = 0; index < dict_proto.final_node_id_size(); index += 1) {
131 uint64_t final_node_id = dict_proto.final_node_id(index);
133 }
134
135 std::map<uint64_t, DawgNode *> nodes;
136 for (int index = 0; index < dict_proto.node_id_size(); index += 1) {
137 uint64_t node_id = dict_proto.node_id(index);
138 if (!nodes.contains(node_id)) {
139 bool is_final = final_node_ids.contains(node_id);
140 auto *node = new DawgNode(is_final);
141 nodes[node_id] = node;
142 }
143 }
144
145 for (int index = 0; index < dict_proto.edge_size(); index += 1) {
146 const llp::Dictionary_Edge &edge_proto = dict_proto.edge(index);
147 uint64_t source_id = edge_proto.source_id();
148 char label = (char)edge_proto.label();
149 uint64_t target_id = edge_proto.target_id();
150 DawgNode *source = nodes[source_id];
151 DawgNode *target = nodes[target_id];
152 source->add_edge(label, target);
153 }
154
155 DawgNode *root = nodes[dict_proto.root_id()];
156 auto *dawg = new SortedDawg(root, dict_proto.size());
157 return dawg;
158}
159
160} // namespace liblevenshtein
Represents a position within one or more terms of a DAWG dictionary.
Definition dawg_node.h:20
void is_final(bool is_final)
Specifies whether this node represents a word boundary, or immediately follows an edge having the fin...
Definition dawg_node.cpp:19
void for_each_edge(const std::function< void(char, DawgNode *)> &fn) const
Iterates over each outgoing edge of this node and invokes a callback function with each edge's charac...
Definition dawg_node.cpp:27
auto add_edge(char label, DawgNode *target) -> DawgNode *
Adds a new outgoing edge to this node.
Definition dawg_node.cpp:41
A Directed Acyclic Word Graph (DAWG) maps sequences of characters to form words; the collection of wo...
Definition dawg.h:23
auto contains(const std::string &term) const -> bool
Determines whether the given term is contained within this dictionary.
Definition dawg.cpp:43
A specific type of Dawg that is constructed over lexicographically sorted terms.
Definition sorted_dawg.h:20
void query(ll::Dawg *dawg, const std::string &query_term, std::size_t max_distance)
Definition main.cpp:25
Various utilities regarding Levenshtein transducers.
Definition namespaces.dox:9
auto to_protobuf(Dawg *dawg) -> llp::Dictionary *
Serializes a Dawg to its protobuf equivalent.
void collect_nodes(DawgNode *source, std::set< uint64_t > &node_ids, std::set< uint64_t > &final_node_ids)
Collects the DawgNode IDs and final DawgNode IDs of all nodes reachable from the source.
auto from_protobuf(const llp::Dictionary &dict_proto) -> Dawg *
Deserializes a Dawg from its protobuf equivalent.
void collect_edges(DawgNode *source, std::map< std::pair< uint64_t, char >, uint64_t > &edges)
Collects the transitions from each source to its destination, and the respective character labels.
auto deserialize_protobuf(const fs::path &path) -> Dawg *
Deserializes the protobuf containing a Dawg at the given path or returns nullptr if none exists.
auto serialize_protobuf(Dawg *dawg, const fs::path &path) -> bool
Serializes the given Dawg to protobuf at the given path.