Open Chinese Convert  1.1.0
A project for conversion between Traditional and Simplified Chinese
PhraseExtract.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #include <unordered_map>
22 
23 #include "Common.hpp"
24 #include "UTF8StringSlice.hpp"
25 
26 namespace opencc {
27 
28 class OPENCC_EXPORT PhraseExtract {
29 public:
30  typedef UTF8StringSlice::LengthType LengthType;
31 
33 
34  PhraseExtract();
35 
36  virtual ~PhraseExtract();
37 
38  void Extract(const string& text) {
39  SetFullText(text);
40  ExtractSuffixes();
41  CalculateFrequency();
42  CalculateSuffixEntropy();
43  ReleaseSuffixes();
44  ExtractPrefixes();
45  CalculatePrefixEntropy();
46  ReleasePrefixes();
47  ExtractWordCandidates();
48  CalculateCohesions();
49  SelectWords();
50  }
51 
52  void SetFullText(const string& fullText) {
53  utf8FullText = UTF8StringSlice(fullText.c_str());
54  }
55 
56  void SetFullText(const char* fullText) {
57  utf8FullText = UTF8StringSlice(fullText);
58  }
59 
60  void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
61 
62  void SetWordMinLength(const LengthType _wordMinLength) {
63  wordMinLength = _wordMinLength;
64  }
65 
66  void SetWordMaxLength(const LengthType _wordMaxLength) {
67  wordMaxLength = _wordMaxLength;
68  }
69 
70  void SetPrefixSetLength(const LengthType _prefixSetLength) {
71  prefixSetLength = _prefixSetLength;
72  }
73 
74  void SetSuffixSetLength(const LengthType _suffixSetLength) {
75  suffixSetLength = _suffixSetLength;
76  }
77 
78  // PreCalculationFilter is called after frequencies statistics.
79  void SetPreCalculationFilter(
80  const std::function<bool(const PhraseExtract&,
81  const UTF8StringSlice8Bit&)>& filter) {
82  preCalculationFilter = filter;
83  }
84 
85  void SetPostCalculationFilter(
86  const std::function<bool(const PhraseExtract&,
87  const UTF8StringSlice8Bit&)>& filter) {
88  postCalculationFilter = filter;
89  }
90 
91  void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); }
92 
93  void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); }
94 
95  const vector<UTF8StringSlice8Bit>& Words() const { return words; }
96 
97  const vector<UTF8StringSlice8Bit>& WordCandidates() const {
98  return wordCandidates;
99  }
100 
101  struct Signals {
102  size_t frequency;
103  double cohesion;
104  double suffixEntropy;
105  double prefixEntropy;
106  };
107 
108  const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
109 
110  double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
111 
112  double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
113 
114  double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
115 
116  double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
117 
118  size_t Frequency(const UTF8StringSlice8Bit& word) const;
119 
120  double Probability(const UTF8StringSlice8Bit& word) const;
121 
122  double LogProbability(const UTF8StringSlice8Bit& word) const;
123 
124  void Reset();
125 
126  void ExtractSuffixes();
127 
128  void ExtractPrefixes();
129 
130  void ExtractWordCandidates();
131 
132  void CalculateFrequency();
133 
134  void CalculateCohesions();
135 
136  void CalculateSuffixEntropy();
137 
138  void CalculatePrefixEntropy();
139 
140  void SelectWords();
141 
142  static bool
143  DefaultPreCalculationFilter(const PhraseExtract&,
145 
146  static bool
147  DefaultPostCalculationFilter(const PhraseExtract&,
149 
150 private:
151  class DictType;
152 
153  // Pointwise Mutual Information
154  double PMI(const UTF8StringSlice8Bit& wordCandidate,
155  const UTF8StringSlice8Bit& part1,
156  const UTF8StringSlice8Bit& part2) const;
157 
158  double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
159 
160  double CalculateEntropy(
161  const std::unordered_map<UTF8StringSlice8Bit, size_t,
162  UTF8StringSlice8Bit::Hasher>& choices) const;
163 
164  LengthType wordMinLength;
165  LengthType wordMaxLength;
166  LengthType prefixSetLength;
167  LengthType suffixSetLength;
168  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
169  preCalculationFilter;
170  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
171  postCalculationFilter;
172 
173  bool prefixesExtracted;
174  bool suffixesExtracted;
175  bool frequenciesCalculated;
176  bool wordCandidatesExtracted;
177  bool cohesionsCalculated;
178  bool prefixEntropiesCalculated;
179  bool suffixEntropiesCalculated;
180  bool wordsSelected;
181 
182  UTF8StringSlice utf8FullText;
183  size_t totalOccurrence;
184  double logTotalOccurrence;
185  vector<UTF8StringSlice8Bit> prefixes;
186  vector<UTF8StringSlice8Bit> suffixes;
187  vector<UTF8StringSlice8Bit> wordCandidates;
188  vector<UTF8StringSlice8Bit> words;
189  DictType* signals;
190 
191  friend class PhraseExtractTest;
192 };
193 
194 } // namespace opencc
opencc::UTF8StringSliceBase::Hasher
Definition: UTF8StringSlice.hpp:200
opencc::PhraseExtract::Signals
Definition: PhraseExtract.hpp:101
opencc::UTF8StringSliceBase
Definition: UTF8StringSlice.hpp:52
opencc::PhraseExtract
Definition: PhraseExtract.hpp:28