21 #include <unordered_map>
24 #include "UTF8StringSlice.hpp"
30 typedef UTF8StringSlice::LengthType LengthType;
// Runs extraction steps for `text`. The calls listed here compute the suffix
// entropy, then the prefix entropy, and then extract the word candidates.
// NOTE(review): how `text` is consumed and whether other steps run first is
// not established by these lines — confirm against the full definition.
38 void Extract(
const string& text) {
42 CalculateSuffixEntropy();
45 CalculatePrefixEntropy();
47 ExtractWordCandidates();
// Overload of SetFullText that accepts the text as a `string`.
// NOTE(review): presumably ends up in utf8FullText, as in the UTF8StringSlice
// overload — confirm.
52 void SetFullText(
const string& fullText) {
// Overload of SetFullText that accepts the text as a `const char*`.
// NOTE(review): presumably expects a NUL-terminated UTF-8 string whose buffer
// must outlive this object — confirm ownership and encoding with the caller.
56 void SetFullText(
const char* fullText) {
// Overload of SetFullText that assigns the given slice to utf8FullText.
60 void SetFullText(
const UTF8StringSlice& fullText) { utf8FullText = fullText; }
// Stores `_wordMinLength` in the wordMinLength member.
// NOTE(review): presumably the minimum length of an extracted word — units
// (bytes vs. UTF-8 characters) should be confirmed against LengthType.
62 void SetWordMinLength(
const LengthType _wordMinLength) {
63 wordMinLength = _wordMinLength;
// Stores `_wordMaxLength` in the wordMaxLength member.
// NOTE(review): presumably the maximum length of an extracted word — confirm
// units against LengthType.
66 void SetWordMaxLength(
const LengthType _wordMaxLength) {
67 wordMaxLength = _wordMaxLength;
// Stores `_prefixSetLength` in the prefixSetLength member.
// NOTE(review): presumably how much preceding context is considered when
// computing prefix statistics — confirm where prefixSetLength is read.
70 void SetPrefixSetLength(
const LengthType _prefixSetLength) {
71 prefixSetLength = _prefixSetLength;
// Stores `_suffixSetLength` in the suffixSetLength member.
// NOTE(review): presumably how much following context is considered when
// computing suffix statistics — confirm where suffixSetLength is read.
74 void SetSuffixSetLength(
const LengthType _suffixSetLength) {
75 suffixSetLength = _suffixSetLength;
// Stores the `filter` callback (a std::function) in preCalculationFilter.
// NOTE(review): presumably consulted before the statistics are calculated —
// confirm the callback signature and where it is invoked.
79 void SetPreCalculationFilter(
const std::function<
81 preCalculationFilter = filter;
// Stores the `filter` callback (a std::function) in postCalculationFilter.
// NOTE(review): presumably consulted after the statistics are calculated —
// confirm the callback signature and where it is invoked.
84 void SetPostCalculationFilter(
const std::function<
86 postCalculationFilter = filter;
// Empties `suffixes` by swapping it with a temporary empty vector. The old
// contents move into the temporary and are destroyed with it at the end of the
// statement, so the storage is released rather than merely cleared.
89 void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); }
// Empties `prefixes` by swapping it with a temporary empty vector. The old
// contents move into the temporary and are destroyed with it at the end of the
// statement, so the storage is released rather than merely cleared.
91 void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); }
// Returns a const reference to the `words` vector; no copy is made, so the
// reference is only valid while this object (and that vector) is alive.
93 const vector<UTF8StringSlice8Bit>& Words()
const {
return words; }
// Returns a const reference to the `wordCandidates` vector; no copy is made,
// so the reference is only valid while this object (and that vector) is alive.
95 const vector<UTF8StringSlice8Bit>& WordCandidates()
const {
96 return wordCandidates;
// NOTE(review): presumably per-candidate entropy values filled in by
// CalculateSuffixEntropy / CalculatePrefixEntropy — confirm which type owns
// these fields and who writes them.
102 double suffixEntropy;
103 double prefixEntropy;
// Declarations of the individual extraction steps (declared here only, no
// bodies). Extract() calls CalculateSuffixEntropy, CalculatePrefixEntropy and
// ExtractWordCandidates, in that order.
// NOTE(review): the *Extracted / *Calculated flags declared in this class
// presumably let each step run at most once — confirm in the definitions.
124 void ExtractSuffixes();
126 void ExtractPrefixes();
128 void ExtractWordCandidates();
130 void CalculateFrequency();
132 void CalculateCohesions();
134 void CalculateSuffixEntropy();
136 void CalculatePrefixEntropy();
// Returns a double computed from a std::unordered_map passed by const
// reference.
// NOTE(review): the map's key/value types and the exact entropy formula are
// not established here — confirm against the definition.
158 double CalculateEntropy(
const std::unordered_map<
// Configuration written by the Set* methods: SetWordMinLength,
// SetWordMaxLength, SetPrefixSetLength, SetSuffixSetLength,
// SetPreCalculationFilter and SetPostCalculationFilter respectively.
161 LengthType wordMinLength;
162 LengthType wordMaxLength;
163 LengthType prefixSetLength;
164 LengthType suffixSetLength;
166 preCalculationFilter;
168 postCalculationFilter;
// One boolean flag per extraction/calculation step.
// NOTE(review): presumably set once the matching step has completed so that
// it is not repeated — confirm where these are read and written.
170 bool prefixesExtracted;
171 bool suffixesExtracted;
172 bool frequenciesCalculated;
173 bool wordCandidatesExtracted;
174 bool cohesionsCalculated;
175 bool prefixEntropiesCalculated;
176 bool suffixEntropiesCalculated;
// Working data. `prefixes` and `suffixes` can be emptied via ReleasePrefixes /
// ReleaseSuffixes; `wordCandidates` and `words` are exposed read-only through
// WordCandidates() and Words().
// NOTE(review): logTotalOccurrence is presumably the cached logarithm of
// totalOccurrence — confirm the base and where both are computed.
180 size_t totalOccurrence;
181 double logTotalOccurrence;
182 vector<UTF8StringSlice8Bit> prefixes;
183 vector<UTF8StringSlice8Bit> suffixes;
184 vector<UTF8StringSlice8Bit> wordCandidates;
185 vector<UTF8StringSlice8Bit> words;
// Grants PhraseExtractTest access to this class's non-public members.
// NOTE(review): presumably the unit-test fixture — confirm.
188 friend class PhraseExtractTest;