1 /**
2    Copyright: Martin Nowak 2013-.
3    License: MIT License, see LICENSE
4    Authors: $(WEB code.dawg.eu, Martin Nowak)
5 */
6 module hyphenate;
7 
8 import std.algorithm, std.conv, std.range;
9 import std.ascii : isDigit;
10 import std.uni : toLower;
11 import std.stdio, std.traits;
12 import core.bitop;
13 
14 /**
15    Compressed representation for short integral arrays.
16  */
17 struct BitArray
18 {
19     ///
20     this(R)(in R data) if (isIntegral!(ElementType!R) && isUnsigned!(ElementType!R))
21     {
22         _bits = encode(data);
23     }
24 
25     ///
26     @property bool empty() const
27     {
28         return _bits == fillBits;
29     }
30 
31     ///
32     @property ubyte front() const
33     in { assert(!empty); }
34     body
35     {
36         return cast(ubyte)((!!(_bits & 1) ? bsf(~_bits) : bsf(_bits)) - 1);
37     }
38 
39     ///
40     void popFront()
41     {
42         immutable shift = front + 1;
43         _bits = _bits >> shift | fillBits << 32 - shift;
44     }
45 
46 private:
47     uint encode(R)(in R data)
48     in { assert(reduce!"a+b"(0, data) + data.length < 32, data.to!string()); }
49     body
50     {
51         uint res = fillBits;
52         size_t i;
53         foreach (val; data.retro())
54         {
55             res <<= val + 1;
56             if (i++ & 1) res |= (1 << val + 1) - 1;
57         }
58         return res;
59     }
60 
61     enum fillBits = uint.max;
62     uint _bits = fillBits;
63 }
64 
65 unittest
66 {
67     foreach (val; BitArray([14u, 15u]))
68     {}
69 
70     foreach (data; [[0u], [0u, 1u, 0u], [30u], [14u, 15u], [0u, 13u, 13u, 0u, 0u]])
71         assert(equal(BitArray(data), data));
72 }
73 
74 alias Priorities = BitArray;
75 
76 @property auto letters(string s) { return s.filter!(a => !a.isDigit())(); }
77 @property Priorities priorities(R)(R r)
78 {
79     ubyte[20] buf = void;
80     size_t pos = 0;
81     while (!r.empty)
82     {
83         immutable c = r.front; r.popFront();
84         if (c.isDigit())
85         {
86             buf[pos++] = cast(ubyte)(c - '0');
87             if (!r.empty) r.popFront();
88         }
89         else
90         {
91             buf[pos++] = 0;
92         }
93     }
94     while (pos && buf[pos-1] == 0)
95         --pos;
96     return Priorities(buf[0 .. pos]);
97 }
98 
99 ///
100 unittest
101 {
102     enum testcases = [
103         "a1bc3d4" : [0, 1, 0, 3, 4],
104         "to2gr" : [0, 0, 2],
105         "1to" : [1],
106         "x3c2" : [0, 3, 2],
107         "1a2b3c4" : [1, 2, 3, 4],
108     ];
109     foreach (pat, prios; testcases)
110         assert(equal(priorities(pat), prios));
111 }
112 
113 // HACK: core.bitop.bt is not always inlined
114 bool bt(in uint* p, size_t bitnum) pure nothrow
115 {
116     return !!(p[bitnum >> 5] & 1 << (bitnum & 31));
117 }
118 
119 struct Trie
120 {
121     this(dchar c)
122     {
123         debug _c = c;
124     }
125 
126     static struct Table
127     {
128         inout(Trie)* opBinaryRight(string op : "in")(dchar c) inout
129         {
130             if (c >= 128 || !bt(bitmask.ptr, c))
131                 return null;
132             return &entries[getPos(c)-1];
133         }
134 
135         ref Trie getLvalue(dchar c)
136         {
137             if (c >= 128) assert(0);
138             auto npos = getPos(c);
139             if (!bts(bitmaskPtr, c))
140                 entries.insertInPlace(npos++, Trie(c));
141             return entries[npos-1];
142         }
143 
144         private size_t getPos(dchar c) const
145         {
146             immutable nbyte = c / 32; c &= 31;
147             size_t npos;
148             foreach (i; 0 .. nbyte)
149                 npos += _popcnt(bitmask[i]);
150             npos += _popcnt(bitmask[nbyte] & (1 << c | (1 << c) - 1));
151             return npos;
152         }
153 
154         private @property inout(size_t)* bitmaskPtr() inout
155         {
156             return cast(inout(size_t)*)bitmask.ptr;
157         }
158 
159         version (DigitalMars) private static int asmPopCnt(uint val) pure
160         {
161             static if (__VERSION__ > 2066)
162                 enum pure_ = " pure";
163             else
164                 enum pure_ = "";
165 
166             version (D_InlineAsm_X86)
167                 mixin("asm"~pure_~" { naked; popcnt EAX, EAX; ret; }");
168             else version (D_InlineAsm_X86_64)
169                 mixin("asm"~pure_~" { naked; popcnt EAX, EDI; ret; }");
170             else
171                 assert(0);
172         }
173 
174         private static immutable int function(uint) pure _popcnt;
175 
176         shared static this()
177         {
178             import core.cpuid;
179             static if (is(typeof(core.bitop.popcnt!()(0))))
180                 _popcnt = &core.bitop.popcnt!();
181             else
182                 _popcnt = &core.bitop.popcnt;
183             version (DigitalMars) if (hasPopcnt)
184                 _popcnt = &asmPopCnt;
185         }
186 
187         uint[4] bitmask;
188         Trie[] entries;
189     }
190     debug dchar _c;
191     Priorities priorities;
192     version (none)
193         Trie[dchar] elems;
194     else
195         Table elems;
196 }
197 
198 /**
199    Hyphenator is used to build the pattern tries.
200  */
201 struct Hyphenator
202 {
203     /// initialize with the content of a Tex pattern file
204     this(string s)
205     {
206         auto lines = s.splitter("\n");
207         lines = lines.find!(a => a.startsWith(`\patterns{`))();
208         lines.popFront();
209         foreach (line; refRange(&lines).until!(a => a.startsWith("}"))())
210         {
211             insertPattern(line);
212         }
213         assert(lines.front.startsWith("}"));
214         lines.popFront();
215         assert(lines.front.startsWith(`\hyphenation{`));
216         lines.popFront();
217         foreach (line; refRange(&lines).until!(a => a.startsWith("}"))())
218         {
219             insertException(line);
220         }
221         assert(lines.front.startsWith("}"));
222         lines.popFront();
223         assert(lines.front.empty);
224         lines.popFront();
225         assert(lines.empty);
226     }
227 
228     /// hyphenate $(PARAM word) with $(PARAM hyphen)
229     string hyphenate(const(char)[] word, const(char)[] hyphen) const
230     {
231         string ret;
232         hyphenate(word, hyphen, (s) { ret ~= s; });
233         return ret;
234     }
235 
236     /// hyphenate $(PARAM word) with $(PARAM hyphen) and output the result to $(PARAM sink)
237     void hyphenate(const(char)[] word, const(char)[] hyphen, scope void delegate(in char[]) sink) const
238     {
239         if (word.length <= 3) return sink(word);
240 
241         static ubyte[] buf;
242         static Appender!(char[]) app;
243         app.put(word.map!toLower());
244 
245         const(ubyte)[] prios;
246         if (auto p = app.data in exceptions)
247             prios = *p;
248         else
249             prios = buildPrios(app.data, buf);
250 
251         app.clear();
252 
253         assert(prios.length == word.length-1);
254         app.put(word.front);
255         word.popFront();
256         foreach (c, prio; zip(word, prios))
257         {
258             if (prio & 1) app.put(hyphen);
259             app.put(c);
260         }
261         sink(app.data);
262         app.clear();
263     }
264 
265 private:
266     ubyte[] buildPrios(in char[] word, ref ubyte[] buf) const
267     {
268         auto search = chain(".", word, ".");
269         if (buf.length < word.length + 3)
270         {
271             assumeSafeAppend(buf);
272             buf.length = word.length + 3;
273         }
274         buf[] = 0;
275         for (size_t pos; !search.empty; ++pos, search.popFront())
276         {
277             auto p = &root;
278             foreach (c; search)
279             {
280                 if ((p = c in p.elems) is null) break;
281                 size_t off;
282                 foreach (prio; cast()p.priorities)
283                 {
284                     buf[pos + off] = max(buf[pos + off], prio);
285                     ++off;
286                 }
287             }
288         }
289         // trim priorities before and after leading '.'
290         // trim priorities before and after trailing '.'
291         auto slice = buf[2..2+word.length-1];
292         // no hyphens after first or before last letter
293         slice[0] = slice[$-1] = 0;
294         return slice;
295     }
296 
297     Leave findLeave(R)(R rng)
298     {
299         return root.getLeave(rng, false);
300     }
301 
302     void insertPattern(R)(R rng)
303     {
304         auto p = &getTerminal(rng.letters);
305         *p = rng.priorities;
306     }
307 
308     private ref Priorities getTerminal(R)(R rng)
309     {
310         auto p = &root;
311         foreach (c; rng)
312             p = &p.elems.getLvalue(c);
313         return p.priorities;
314     }
315 
316     void insertException(string s)
317     {
318         auto prios = exceptionPriorities(s);
319         s = s.filter!(a => a != '-')().to!string();
320         exceptions[s] = prios;
321     }
322 
323     static immutable(ubyte)[] exceptionPriorities(string s)
324     {
325         typeof(return) prios;
326         for (s.popFront(); !s.empty; s.popFront())
327         {
328             if (s.front == '-')
329                 prios ~= 1, s.popFront();
330             else
331                 prios ~= 0;
332         }
333         return prios;
334     }
335 
336     unittest
337     {
338         assert(exceptionPriorities("as-so-ciate") == [0, 1, 0, 1, 0, 0, 0, 0]);
339     }
340 
341     immutable(ubyte)[][string] exceptions;
342     Trie root;
343 }
344 
345 unittest
346 {
347     import std.file : readText;
348 
349     auto h = Hyphenator(readText("patterns/hyphen.tex"));
350     assert(h.hyphenate("hyphenate", "-") == "hy-phen-ate");
351     assert(h.hyphenate("patterns", "-") == "pat-terns");
352     assert(h.hyphenate("foobar", "|") == "foo|bar");
353     assert(h.hyphenate("obligatory", "-") == "oblig-a-tory");
354 }
355 
356 unittest
357 {
358     import std.array : appender;
359     import std.file : readText;
360 
361     auto h = Hyphenator(readText("patterns/hyphen.tex"));
362     auto app = appender!(char[]);
363     h.hyphenate("hyphenate", "-", s => app.put(s));
364     assert(app.data == "hy-phen-ate");
365 }