21 #include <unordered_map>
24 #include "UTF8StringSlice.hpp"
// Local shorthand for the length type declared by UTF8StringSlice.
30 typedef UTF8StringSlice::LengthType LengthType;
// Entry point taking the input text by const reference. The statements
// visible here invoke, in this order, CalculateSuffixEntropy(),
// CalculatePrefixEntropy() and ExtractWordCandidates().
// NOTE(review): the body is only partially visible in this excerpt; other
// steps may run before, between or after these calls — confirm against the
// full file.
38 void Extract(
const string& text) {
42 CalculateSuffixEntropy();
45 CalculatePrefixEntropy();
47 ExtractWordCandidates();
// SetFullText overload accepting the text as a `string` by const reference.
// NOTE(review): the body is not visible in this excerpt; presumably it ends
// up populating utf8FullText like the UTF8StringSlice overload — confirm.
52 void SetFullText(
const string& fullText) {
// SetFullText overload accepting the text as a `const char*`.
// NOTE(review): the body is not visible in this excerpt; whether the pointer
// must be NUL-terminated or outlive this object cannot be told from here.
56 void SetFullText(
const char* fullText) {
// SetFullText overload taking an already-built UTF8StringSlice; the slice is
// copy-assigned into utf8FullText.
60 void SetFullText(const UTF8StringSlice& fullText) {
  utf8FullText = fullText;
}
// Stores the given value in wordMinLength.
// NOTE(review): no validation is visible here (e.g. against wordMaxLength);
// the end of the body is outside this excerpt.
62 void SetWordMinLength(
const LengthType _wordMinLength) {
63 wordMinLength = _wordMinLength;
// Stores the given value in wordMaxLength.
// NOTE(review): no validation is visible here (e.g. against wordMinLength);
// the end of the body is outside this excerpt.
66 void SetWordMaxLength(
const LengthType _wordMaxLength) {
67 wordMaxLength = _wordMaxLength;
// Stores the given value in prefixSetLength.
// NOTE(review): the end of the body is outside this excerpt.
70 void SetPrefixSetLength(
const LengthType _prefixSetLength) {
71 prefixSetLength = _prefixSetLength;
// Stores the given value in suffixSetLength.
// NOTE(review): the end of the body is outside this excerpt.
74 void SetSuffixSetLength(
const LengthType _suffixSetLength) {
75 suffixSetLength = _suffixSetLength;
// Stores the supplied `filter` in preCalculationFilter.
// NOTE(review): the parameter list (and therefore the filter's type and
// signature) is not visible in this excerpt.
79 void SetPreCalculationFilter(
82 preCalculationFilter = filter;
// Stores the supplied `filter` in postCalculationFilter.
// NOTE(review): the parameter list (and therefore the filter's type and
// signature) is not visible in this excerpt.
85 void SetPostCalculationFilter(
88 postCalculationFilter = filter;
// Empties `suffixes` by swapping it with a fresh, empty vector; the previous
// contents are destroyed together with the local when the function returns.
91 void ReleaseSuffixes() {
  vector<UTF8StringSlice8Bit> drained;
  drained.swap(suffixes);
}
// Empties `prefixes` by swapping it with a fresh, empty vector; the previous
// contents are destroyed together with the local when the function returns.
93 void ReleasePrefixes() {
  vector<UTF8StringSlice8Bit> drained;
  drained.swap(prefixes);
}
// Read-only accessor: hands out a const reference to the `words` member
// (no copy is made).
95 const vector<UTF8StringSlice8Bit>& Words() const {
  const vector<UTF8StringSlice8Bit>& extracted = words;
  return extracted;
}
// Read-only accessor: returns a const reference to the `wordCandidates`
// member (no copy is made).
97 const vector<UTF8StringSlice8Bit>& WordCandidates()
const {
98 return wordCandidates;
// Entropy values stored as doubles.
// NOTE(review): the enclosing declaration is not visible in this excerpt;
// presumably these are filled in by CalculateSuffixEntropy() and
// CalculatePrefixEntropy() — confirm.
104 double suffixEntropy;
105 double prefixEntropy;
// Declarations of the individual processing steps; their definitions are not
// part of this excerpt. Extract() visibly calls CalculateSuffixEntropy(),
// CalculatePrefixEntropy() and ExtractWordCandidates().
// NOTE(review): any ordering requirements between these steps cannot be
// determined from the declarations alone.
126 void ExtractSuffixes();
128 void ExtractPrefixes();
130 void ExtractWordCandidates();
132 void CalculateFrequency();
134 void CalculateCohesions();
136 void CalculateSuffixEntropy();
138 void CalculatePrefixEntropy();
// Helper returning a double.
// NOTE(review): the parameter list is cut off in this excerpt, so the inputs
// and the exact entropy definition cannot be documented from here.
160 double CalculateEntropy(
// Length settings written by SetWordMinLength(), SetWordMaxLength(),
// SetPrefixSetLength() and SetSuffixSetLength() respectively.
// NOTE(review): default values and units are not visible in this excerpt.
164 LengthType wordMinLength;
165 LengthType wordMaxLength;
166 LengthType prefixSetLength;
167 LengthType suffixSetLength;
// Filter members assigned by SetPreCalculationFilter() and
// SetPostCalculationFilter().
// NOTE(review): the lines carrying their types are not visible in this
// excerpt.
169 preCalculationFilter;
171 postCalculationFilter;
// Boolean state flags.
// NOTE(review): judging by the names, each presumably records whether the
// matching Extract*/Calculate* step has already run; where they are set and
// read is not visible in this excerpt — confirm.
173 bool prefixesExtracted;
174 bool suffixesExtracted;
175 bool frequenciesCalculated;
176 bool wordCandidatesExtracted;
177 bool cohesionsCalculated;
178 bool prefixEntropiesCalculated;
179 bool suffixEntropiesCalculated;
// Working data.
// NOTE(review): logTotalOccurrence is presumably a logarithm of
// totalOccurrence; where both are computed is not visible here — confirm.
183 size_t totalOccurrence;
184 double logTotalOccurrence;
// Emptied by ReleasePrefixes() / ReleaseSuffixes() respectively.
185 vector<UTF8StringSlice8Bit> prefixes;
186 vector<UTF8StringSlice8Bit> suffixes;
// Exposed read-only through WordCandidates() / Words() respectively.
187 vector<UTF8StringSlice8Bit> wordCandidates;
188 vector<UTF8StringSlice8Bit> words;
// Grants PhraseExtractTest access to this class's non-public members.
191 friend class PhraseExtractTest;