1 /** 2 Copyright: Martin Nowak 2013-. 3 License: MIT License, see LICENSE 4 Authors: $(WEB code.dawg.eu, Martin Nowak) 5 */ 6 module hyphenate; 7 8 import std.algorithm, std.conv, std.range; 9 import std.ascii : isDigit; 10 import std.uni : toLower; 11 import std.stdio, std.traits; 12 import core.bitop; 13 14 /** 15 Compressed representation for short integral arrays. 16 */ 17 struct BitArray 18 { 19 /// 20 this(R)(in R data) if (isIntegral!(ElementType!R) && isUnsigned!(ElementType!R)) 21 { 22 _bits = encode(data); 23 } 24 25 /// 26 @property bool empty() const 27 { 28 return _bits == fillBits; 29 } 30 31 /// 32 @property ubyte front() const 33 in { assert(!empty); } 34 body 35 { 36 return cast(ubyte)((!!(_bits & 1) ? bsf(~_bits) : bsf(_bits)) - 1); 37 } 38 39 /// 40 void popFront() 41 { 42 immutable shift = front + 1; 43 _bits = _bits >> shift | fillBits << 32 - shift; 44 } 45 46 private: 47 uint encode(R)(in R data) 48 in { assert(reduce!"a+b"(0, data) + data.length < 32, data.to!string()); } 49 body 50 { 51 uint res = fillBits; 52 size_t i; 53 foreach (val; data.retro()) 54 { 55 res <<= val + 1; 56 if (i++ & 1) res |= (1 << val + 1) - 1; 57 } 58 return res; 59 } 60 61 enum fillBits = uint.max; 62 uint _bits = fillBits; 63 } 64 65 unittest 66 { 67 foreach (val; BitArray([14u, 15u])) 68 {} 69 70 foreach (data; [[0u], [0u, 1u, 0u], [30u], [14u, 15u], [0u, 13u, 13u, 0u, 0u]]) 71 assert(equal(BitArray(data), data)); 72 } 73 74 alias Priorities = BitArray; 75 76 @property auto letters(string s) { return s.filter!(a => !a.isDigit())(); } 77 @property Priorities priorities(R)(R r) 78 { 79 ubyte[20] buf = void; 80 size_t pos = 0; 81 while (!r.empty) 82 { 83 immutable c = r.front; r.popFront(); 84 if (c.isDigit()) 85 { 86 buf[pos++] = cast(ubyte)(c - '0'); 87 if (!r.empty) r.popFront(); 88 } 89 else 90 { 91 buf[pos++] = 0; 92 } 93 } 94 while (pos && buf[pos-1] == 0) 95 --pos; 96 return Priorities(buf[0 .. pos]); 97 } 98 99 /// 100 unittest 101 { 102 enum testcases = [ 103 "a1bc3d4" : [0, 1, 0, 3, 4], 104 "to2gr" : [0, 0, 2], 105 "1to" : [1], 106 "x3c2" : [0, 3, 2], 107 "1a2b3c4" : [1, 2, 3, 4], 108 ]; 109 foreach (pat, prios; testcases) 110 assert(equal(priorities(pat), prios)); 111 } 112 113 // HACK: core.bitop.bt is not always inlined 114 bool bt(in uint* p, size_t bitnum) pure nothrow 115 { 116 return !!(p[bitnum >> 5] & 1 << (bitnum & 31)); 117 } 118 119 struct Trie 120 { 121 this(dchar c) 122 { 123 debug _c = c; 124 } 125 126 static struct Table 127 { 128 inout(Trie)* opBinaryRight(string op : "in")(dchar c) inout 129 { 130 if (c >= 128 || !bt(bitmask.ptr, c)) 131 return null; 132 return &entries[getPos(c)-1]; 133 } 134 135 ref Trie getLvalue(dchar c) 136 { 137 if (c >= 128) assert(0); 138 auto npos = getPos(c); 139 if (!bts(bitmaskPtr, c)) 140 entries.insertInPlace(npos++, Trie(c)); 141 return entries[npos-1]; 142 } 143 144 private size_t getPos(dchar c) const 145 { 146 immutable nbyte = c / 32; c &= 31; 147 size_t npos; 148 foreach (i; 0 .. nbyte) 149 npos += _popcnt(bitmask[i]); 150 npos += _popcnt(bitmask[nbyte] & (1 << c | (1 << c) - 1)); 151 return npos; 152 } 153 154 private @property inout(size_t)* bitmaskPtr() inout 155 { 156 return cast(inout(size_t)*)bitmask.ptr; 157 } 158 159 version (DigitalMars) private static int asmPopCnt(uint val) pure 160 { 161 static if (__VERSION__ > 2066) 162 enum pure_ = " pure"; 163 else 164 enum pure_ = ""; 165 166 version (D_InlineAsm_X86) 167 mixin("asm"~pure_~" { naked; popcnt EAX, EAX; ret; }"); 168 else version (D_InlineAsm_X86_64) 169 mixin("asm"~pure_~" { naked; popcnt EAX, EDI; ret; }"); 170 else 171 assert(0); 172 } 173 174 private static immutable int function(uint) pure _popcnt; 175 176 shared static this() 177 { 178 import core.cpuid; 179 static if (is(typeof(core.bitop.popcnt!()(0)))) 180 _popcnt = &core.bitop.popcnt!(); 181 else 182 _popcnt = &core.bitop.popcnt; 183 version (DigitalMars) if (hasPopcnt) 184 _popcnt = &asmPopCnt; 185 } 186 187 uint[4] bitmask; 188 Trie[] entries; 189 } 190 debug dchar _c; 191 Priorities priorities; 192 version (none) 193 Trie[dchar] elems; 194 else 195 Table elems; 196 } 197 198 /** 199 Hyphenator is used to build the pattern tries. 200 */ 201 struct Hyphenator 202 { 203 /// initialize with the content of a Tex pattern file 204 this(string s) 205 { 206 auto lines = s.splitter("\n"); 207 lines = lines.find!(a => a.startsWith(`\patterns{`))(); 208 lines.popFront(); 209 foreach (line; refRange(&lines).until!(a => a.startsWith("}"))()) 210 { 211 insertPattern(line); 212 } 213 assert(lines.front.startsWith("}")); 214 lines.popFront(); 215 assert(lines.front.startsWith(`\hyphenation{`)); 216 lines.popFront(); 217 foreach (line; refRange(&lines).until!(a => a.startsWith("}"))()) 218 { 219 insertException(line); 220 } 221 assert(lines.front.startsWith("}")); 222 lines.popFront(); 223 assert(lines.front.empty); 224 lines.popFront(); 225 assert(lines.empty); 226 } 227 228 /// hyphenate $(PARAM word) with $(PARAM hyphen) 229 string hyphenate(const(char)[] word, const(char)[] hyphen) const 230 { 231 string ret; 232 hyphenate(word, hyphen, (s) { ret ~= s; }); 233 return ret; 234 } 235 236 /// hyphenate $(PARAM word) with $(PARAM hyphen) and output the result to $(PARAM sink) 237 void hyphenate(const(char)[] word, const(char)[] hyphen, scope void delegate(in char[]) sink) const 238 { 239 if (word.length <= 3) return sink(word); 240 241 static ubyte[] buf; 242 static Appender!(char[]) app; 243 app.put(word.map!toLower()); 244 245 const(ubyte)[] prios; 246 if (auto p = app.data in exceptions) 247 prios = *p; 248 else 249 prios = buildPrios(app.data, buf); 250 251 app.clear(); 252 253 assert(prios.length == word.length-1); 254 app.put(word.front); 255 word.popFront(); 256 foreach (c, prio; zip(word, prios)) 257 { 258 if (prio & 1) app.put(hyphen); 259 app.put(c); 260 } 261 sink(app.data); 262 app.clear(); 263 } 264 265 private: 266 ubyte[] buildPrios(in char[] word, ref ubyte[] buf) const 267 { 268 auto search = chain(".", word, "."); 269 if (buf.length < word.length + 3) 270 { 271 assumeSafeAppend(buf); 272 buf.length = word.length + 3; 273 } 274 buf[] = 0; 275 for (size_t pos; !search.empty; ++pos, search.popFront()) 276 { 277 auto p = &root; 278 foreach (c; search) 279 { 280 if ((p = c in p.elems) is null) break; 281 size_t off; 282 foreach (prio; cast()p.priorities) 283 { 284 buf[pos + off] = max(buf[pos + off], prio); 285 ++off; 286 } 287 } 288 } 289 // trim priorities before and after leading '.' 290 // trim priorities before and after trailing '.' 291 auto slice = buf[2..2+word.length-1]; 292 // no hyphens after first or before last letter 293 slice[0] = slice[$-1] = 0; 294 return slice; 295 } 296 297 Leave findLeave(R)(R rng) 298 { 299 return root.getLeave(rng, false); 300 } 301 302 void insertPattern(R)(R rng) 303 { 304 auto p = &getTerminal(rng.letters); 305 *p = rng.priorities; 306 } 307 308 private ref Priorities getTerminal(R)(R rng) 309 { 310 auto p = &root; 311 foreach (c; rng) 312 p = &p.elems.getLvalue(c); 313 return p.priorities; 314 } 315 316 void insertException(string s) 317 { 318 auto prios = exceptionPriorities(s); 319 s = s.filter!(a => a != '-')().to!string(); 320 exceptions[s] = prios; 321 } 322 323 static immutable(ubyte)[] exceptionPriorities(string s) 324 { 325 typeof(return) prios; 326 for (s.popFront(); !s.empty; s.popFront()) 327 { 328 if (s.front == '-') 329 prios ~= 1, s.popFront(); 330 else 331 prios ~= 0; 332 } 333 return prios; 334 } 335 336 unittest 337 { 338 assert(exceptionPriorities("as-so-ciate") == [0, 1, 0, 1, 0, 0, 0, 0]); 339 } 340 341 immutable(ubyte)[][string] exceptions; 342 Trie root; 343 } 344 345 unittest 346 { 347 import std.file : readText; 348 349 auto h = Hyphenator(readText("patterns/hyphen.tex")); 350 assert(h.hyphenate("hyphenate", "-") == "hy-phen-ate"); 351 assert(h.hyphenate("patterns", "-") == "pat-terns"); 352 assert(h.hyphenate("foobar", "|") == "foo|bar"); 353 assert(h.hyphenate("obligatory", "-") == "oblig-a-tory"); 354 } 355 356 unittest 357 { 358 import std.array : appender; 359 import std.file : readText; 360 361 auto h = Hyphenator(readText("patterns/hyphen.tex")); 362 auto app = appender!(char[]); 363 h.hyphenate("hyphenate", "-", s => app.put(s)); 364 assert(app.data == "hy-phen-ate"); 365 }