/**
Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange,
and several others.

Utilities in this file:
$(LIST
    * [InputFieldReordering] - A class that creates a reordered subset of fields from
      an input line. Fields in the subset are accessed by array indices. This is
      especially useful when processing the subset in a specific order, such as the
      order listed on the command-line at run-time.

    * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer
      output. Intended for use with stdout, it is a significant performance benefit.

    * [bufferedByLine] - An input range that reads from a File handle line by line.
      It is similar to the standard library method std.stdio.File.byLine, but quite a
      bit faster. This is achieved by reading in larger blocks and buffering.

    * [InputSourceRange] - An input range that provides open file access to a set of
      files. It is used to iterate over files passed as command line arguments. This
      enables reading the header line of a file during command line argument
      processing, then passing the open file to the main processing functions.

    * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides
      access to a byLine iterator (bufferedByLine) rather than an open file. This is
      used by tools that run the same processing logic on both header and non-header
      lines.

    * [joinAppend] - A function that performs a join, but appending the join output to
      an output stream. It is a performance improvement over using join or joiner with
      writeln.

    * [getTsvFieldValue] - A convenience function when only a single value is needed from
      an input line.

    * Field-lists: [parseFieldList], [makeFieldListOptionHandler] - Helper functions for
      parsing field-lists entered on the command line.

    * [throwIfWindowsNewlineOnUnix] - A utility for Unix platform builds for detecting
      Windows newlines in input.
)

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.common.utils;

import std.range;
import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned;
import std.typecons : Flag, No, Yes;

// InputFieldReordering class.

/** Flag used by the InputFieldReordering template. */
alias EnablePartialLines = Flag!"enablePartialLines";

/**
InputFieldReordering - Move select fields from an input line to an output array,
reordering along the way.

The InputFieldReordering class is used to reorder a subset of fields from an input line.
The caller instantiates an InputFieldReordering object at the start of input processing.
The instance contains a mapping from input index to output index, plus a buffer holding
the reordered fields. The caller processes each input line by calling initNewLine,
splitting the line into fields, and calling processNextField on each field. The output
buffer is ready when the allFieldsFilled method returns true.

Fields are not copied, instead the output buffer points to the fields passed by the caller.
The caller needs to use or copy the output buffer while the fields are still valid, which
is normally until reading the next input line. The program below illustrates the basic use
case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also joinAppend,
below, which has a performance improvement over join used here.)

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            writeln(fieldReordering.outputFields.join('\t'));
        }
        else
        {
            writeln("Error: Insufficient number of fields on the line.");
        }
    }
    return 0;
}
---

Field indices are zero-based. An individual field can be listed multiple times. The
outputFields array is not valid until all the specified fields have been processed. The
allFieldsFilled method tests this. If a line does not have enough fields the outputFields
buffer cannot be used. For most TSV applications this is okay, as it means the line is
invalid and cannot be used. However, if partial lines are okay, the template can be
instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in
are empty strings in the outputFields return.
*/
final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no)
if (isSomeChar!C)
{
    /* Implementation: The class works by creating an array of tuples mapping the input
     * field index to the location in the outputFields array. The 'fromToMap' array is
     * sorted in input field order, enabling placement in the outputFields buffer during a
     * pass over the input fields. The map is created by the constructor. An example:
     *
     *    inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9]
     *             fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>]
     *
     * During processing of a line, an array slice, mapStack, is used to track how
     * much of the fromToMap remains to be processed.
     */
    import std.range;
    import std.typecons : Tuple;

    alias TupleFromTo = Tuple!(size_t, "from", size_t, "to");

    private C[][] outputFieldsBuf;
    private TupleFromTo[] fromToMap;
    private TupleFromTo[] mapStack;

    /** Builds the input-index to output-index map.
     *
     * Params:
     *   inputFieldIndicies = Zero-based input field indices, in the desired output order.
     *   start = Output position assigned to the first listed field; subsequent fields
     *           take the following positions.
     *
     * The output buffer has `start + inputFieldIndicies.length` entries so that every
     * mapped output position is inside the buffer. Entries before `start` are never
     * written by processNextField.
     */
    final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe
    {
        import std.algorithm : sort;

        /* The 'to' positions run from start to start + length - 1, so the buffer must
         * be large enough to hold the last of them.
         */
        outputFieldsBuf = new C[][](start + inputFieldIndicies.length);
        fromToMap.reserve(inputFieldIndicies.length);

        foreach (to, from; inputFieldIndicies.enumerate(start))
        {
            fromToMap ~= TupleFromTo(from, to);
        }

        /* Tuples sort by 'from' first, giving input field order. */
        sort(fromToMap);
        initNewLine;
    }

    /** initNewLine initializes the object for a new line. */
    final void initNewLine() pure nothrow @safe
    {
        mapStack = fromToMap;
        static if (partialLinesOk)
        {
            /* Reset every slot so unfilled fields read as empty strings. */
            import std.algorithm : each;
            outputFieldsBuf.each!((ref s) => s.length = 0);
        }
    }

    /** processNextField maps an input field to the correct locations in the
     *  outputFields array.
     *
     * processNextField should be called once for each field on the line, in the order
     * found. The processing of the line can terminate once allFieldsFilled returns
     * true.
     *
     * The return value is the number of output fields the input field maps to. Zero
     * means the field is not mapped to the output fields array.
     *
     * If, prior to allFieldsFilled returning true, any fields on the input line
     * are not passed to processNextField, the caller should either ensure the fields
     * are not part of the output fields or have partial lines enabled.
     */
    final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc
    {
        size_t numFilled = 0;

        /* A field listed several times has consecutive entries in the sorted map. */
        while (!mapStack.empty && fieldIndex == mapStack.front.from)
        {
            outputFieldsBuf[mapStack.front.to] = fieldValue;
            mapStack.popFront;
            numFilled++;
        }
        return numFilled;
    }

    /** allFieldsFilled returns true if all fields expected have been processed. */
    final bool allFieldsFilled() const pure nothrow @safe @nogc
    {
        return mapStack.empty;
    }

    /** outputFields is the assembled output fields. Unless partial lines are enabled,
     *  it is only valid after allFieldsFilled is true.
     */
    final C[][] outputFields() pure nothrow @safe @nogc
    {
        return outputFieldsBuf[];
    }
}

// InputFieldReordering - Tests using different character types.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    auto expected_2_0 = [["r1f2",   "r1f0"],
                         ["ÀBCßßZ", "r2f0"],
                         ["456",    "r3f0"]];

    char[][][]  charExpected_2_0 = to!(char[][][])(expected_2_0);
    wchar[][][] wcharExpected_2_0 = to!(wchar[][][])(expected_2_0);
    dchar[][][] dcharExpected_2_0 = to!(dchar[][][])(expected_2_0);
    dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0);

    auto charIFR  = new InputFieldReordering!char(fields_2_0);
    auto wcharIFR = new InputFieldReordering!wchar(fields_2_0);
    auto dcharIFR = new InputFieldReordering!dchar(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        wcharIFR.initNewLine;
        dcharIFR.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            wcharIFR.processNextField(fieldIndex, to!(wchar[])(fieldValue));
            dcharIFR.processNextField(fieldIndex, to!(dchar[])(fieldValue));

            assert ((fieldIndex >= 2) == charIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == wcharIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == dcharIFR.allFieldsFilled);
        }
        assert(charIFR.allFieldsFilled);
        assert(wcharIFR.allFieldsFilled);
        assert(dcharIFR.allFieldsFilled);

        assert(charIFR.outputFields == charExpected_2_0[lineIndex]);
        assert(wcharIFR.outputFields == wcharExpected_2_0[lineIndex]);
        assert(dcharIFR.outputFields == dcharExpected_2_0[lineIndex]);
    }
}

// InputFieldReordering - Test of a non-zero start position.
@safe unittest
{
    import std.conv : to;

    size_t[] fields_2_0 = [2, 0];
    auto line = ["f0", "f1", "f2", "f3"];

    auto ifr = new InputFieldReordering!char(fields_2_0, 1);
    ifr.initNewLine;
    foreach (fieldIndex, fieldValue; line)
    {
        ifr.processNextField(fieldIndex, to!(char[])(fieldValue));
    }

    assert(ifr.allFieldsFilled);
    assert(ifr.outputFields.length == 3);
    assert(ifr.outputFields[1 .. $] == to!(char[][])(["f2", "f0"]));
}

// InputFieldReordering - Test of partial line support.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    // The expected states of the output field while each line and field are processed.
    auto expectedBylineByfield_2_0 =
        [
            [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"],   ["r1f2", "r1f0"]],
            [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]],
            [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"],    ["456", "r3f0"]],
        ];

    char[][][][] charExpectedBylineByfield_2_0 = to!(char[][][][])(expectedBylineByfield_2_0);

    auto charIFR = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            assert(charIFR.outputFields == charExpectedBylineByfield_2_0[lineIndex][fieldIndex]);
        }
    }
}

// InputFieldReordering - Field combination tests.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["00", "01", "02", "03"],
                       ["10", "11", "12", "13"],
                       ["20", "21", "22", "23"]];

    size_t[] fields_0 = [0];
    size_t[] fields_3 = [3];
    size_t[] fields_01 = [0, 1];
    size_t[] fields_10 = [1, 0];
    size_t[] fields_03 = [0, 3];
    size_t[] fields_30 = [3, 0];
    size_t[] fields_0123 = [0, 1, 2, 3];
    size_t[] fields_3210 = [3, 2, 1, 0];
    size_t[] fields_03001 = [0, 3, 0, 0, 1];

    auto expected_0 = to!(char[][][])([["00"],
                                       ["10"],
                                       ["20"]]);

    auto expected_3 = to!(char[][][])([["03"],
                                       ["13"],
                                       ["23"]]);

    auto expected_01 = to!(char[][][])([["00", "01"],
                                        ["10", "11"],
                                        ["20", "21"]]);

    auto expected_10 = to!(char[][][])([["01", "00"],
                                        ["11", "10"],
                                        ["21", "20"]]);

    auto expected_03 = to!(char[][][])([["00", "03"],
                                        ["10", "13"],
                                        ["20", "23"]]);

    auto expected_30 = to!(char[][][])([["03", "00"],
                                        ["13", "10"],
                                        ["23", "20"]]);

    auto expected_0123 = to!(char[][][])([["00", "01", "02", "03"],
                                          ["10", "11", "12", "13"],
                                          ["20", "21", "22", "23"]]);

    auto expected_3210 = to!(char[][][])([["03", "02", "01", "00"],
                                          ["13", "12", "11", "10"],
                                          ["23", "22", "21", "20"]]);

    auto expected_03001 = to!(char[][][])([["00", "03", "00", "00", "01"],
                                           ["10", "13", "10", "10", "11"],
                                           ["20", "23", "20", "20", "21"]]);

    auto ifr_0 = new InputFieldReordering!char(fields_0);
    auto ifr_3 = new InputFieldReordering!char(fields_3);
    auto ifr_01 = new InputFieldReordering!char(fields_01);
    auto ifr_10 = new InputFieldReordering!char(fields_10);
    auto ifr_03 = new InputFieldReordering!char(fields_03);
    auto ifr_30 = new InputFieldReordering!char(fields_30);
    auto ifr_0123 = new InputFieldReordering!char(fields_0123);
    auto ifr_3210 = new InputFieldReordering!char(fields_3210);
    auto ifr_03001 = new InputFieldReordering!char(fields_03001);

    foreach (lineIndex, line; inputLines)
    {
        ifr_0.initNewLine;
        ifr_3.initNewLine;
        ifr_01.initNewLine;
        ifr_10.initNewLine;
        ifr_03.initNewLine;
        ifr_30.initNewLine;
        ifr_0123.initNewLine;
        ifr_3210.initNewLine;
        ifr_03001.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            ifr_0.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_3.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_01.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_10.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_03.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_30.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_0123.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_3210.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_03001.processNextField(fieldIndex, to!(char[])(fieldValue));
        }

        assert(ifr_0.outputFields == expected_0[lineIndex]);
        assert(ifr_3.outputFields == expected_3[lineIndex]);
        assert(ifr_01.outputFields == expected_01[lineIndex]);
        assert(ifr_10.outputFields == expected_10[lineIndex]);
        assert(ifr_03.outputFields == expected_03[lineIndex]);
        assert(ifr_30.outputFields == expected_30[lineIndex]);
        assert(ifr_0123.outputFields == expected_0123[lineIndex]);
        assert(ifr_3210.outputFields == expected_3210[lineIndex]);
        assert(ifr_03001.outputFields == expected_03001[lineIndex]);
    }
}


import std.stdio : File, isFileHandle, KeepTerminator;
import std.range : isOutputRange;
import std.traits : Unqual;

/**
BufferedOutputRange is a performance enhancement over writing directly to an output
stream. It holds a File open for write or an OutputRange. Output is accumulated in an
internal buffer and written to the output stream as a block.

Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster
than writing to stdout directly. This is especially noticeable for outputs with short
lines, as it blocks many writes together in a single write.

The internal buffer is written to the output stream after flushSize has been reached.
This is checked at newline boundaries, when appendln is called or when put is called
with a single newline character. Other writes check maxSize, which is used to avoid
runaway buffers.

BufferedOutputRange has a put method allowing it to be used as a range. It has a number
of other methods providing additional control.

$(LIST
    * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor. Takes the
      output stream, e.g. stdout. Other arguments are optional, defaults normally suffice.

    * `append(stuff)` - Append to the internal buffer.

    * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer
      is flushed to the output stream if it has reached flushSize.

    * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the
      output stream if it has reached flushSize.

    * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`.
      For reasons that are not clear, joiner is quite slow.

    * `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been
      reached.

    * `flush()` - Write the internal buffer to the output stream.

    * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single
      newline character, '\n' or "\n".
)

The internal buffer is automatically flushed when the BufferedOutputRange goes out of
scope.
*/
struct BufferedOutputRange(OutputTarget)
if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char))
{
    import std.range : isOutputRange;
    import std.array : appender;

    /* Identify the output element type. Only supporting char and ubyte for now. */
    static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char))
    {
        alias C = char;
    }
    else static if (isOutputRange!(OutputTarget, ubyte))
    {
        alias C = ubyte;
    }
    else static assert(false);

    private enum defaultReserveSize = 11264;
    private enum defaultFlushSize = 10240;
    private enum defaultMaxSize = 4194304;

    private OutputTarget _outputTarget;
    private auto _outputBuffer = appender!(C[]);
    private immutable size_t _flushSize;
    private immutable size_t _maxSize;

    /** Creates a buffered wrapper around outputTarget.
     *
     * Params:
     *   outputTarget = The File or output range receiving flushed data.
     *   flushSize = Buffer length at which newline-boundary writes trigger a flush.
     *   reserveSize = Capacity reserved for the internal buffer up front.
     *   maxSize = Buffer length at which a flush happens regardless of newlines.
     *             Expected to be at least flushSize; a smaller value is raised to
     *             flushSize when asserts are disabled.
     */
    this(OutputTarget outputTarget,
         size_t flushSize = defaultFlushSize,
         size_t reserveSize = defaultReserveSize,
         size_t maxSize = defaultMaxSize)
    {
        assert(flushSize <= maxSize);

        _outputTarget = outputTarget;
        _flushSize = flushSize;
        _maxSize = (flushSize <= maxSize) ? maxSize : flushSize;
        _outputBuffer.reserve(reserveSize);
    }

    /* Write out anything still buffered when the range goes out of scope. */
    ~this()
    {
        flush();
    }

    /** Writes the internal buffer to the output target and empties the buffer. */
    void flush()
    {
        static if (isFileHandle!OutputTarget) _outputTarget.write(_outputBuffer.data);
        else _outputTarget.put(_outputBuffer.data);

        _outputBuffer.clear;
    }

    /** Flushes if the buffer has reached flushSize. Returns true if a flush occurred. */
    bool flushIfFull()
    {
        bool isFull = _outputBuffer.data.length >= _flushSize;
        if (isFull) flush();
        return isFull;
    }

    /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */
    void flushIfMaxSize()
    {
        if (_outputBuffer.data.length >= _maxSize) flush();
    }

    /* maybeFlush is intended for the case where put is called with a trailing newline.
     *
     * Flushing occurs if the buffer has a trailing newline and has reached flush size.
     * Flushing also occurs if the buffer has reached max size.
     *
     * An empty buffer is never flushed. This also keeps the trailing character check
     * from indexing an empty buffer, which can otherwise happen when flushSize is zero
     * and empty data is appended.
     */
    private bool maybeFlush()
    {
        immutable size_t bufferLength = _outputBuffer.data.length;

        immutable bool doFlush =
            bufferLength > 0 &&
            bufferLength >= _flushSize &&
            (_outputBuffer.data[bufferLength - 1] == '\n' || bufferLength >= _maxSize);

        if (doFlush) flush();
        return doFlush;
    }

    /* appendRaw adds to the internal buffer without any flush check. */
    private void appendRaw(T)(T stuff) pure
    {
        import std.range : rangePut = put;
        rangePut(_outputBuffer, stuff);
    }

    /** Appends to the internal buffer, then flushes per the maybeFlush rules. */
    void append(T)(T stuff)
    {
        appendRaw(stuff);
        maybeFlush();
    }

    /** Appends a newline, then flushes if flushSize has been reached.
     *  Returns true if a flush occurred.
     */
    bool appendln()
    {
        appendRaw('\n');
        return flushIfFull();
    }

    /** Appends stuff followed by a newline, then flushes if flushSize has been reached.
     *  Returns true if a flush occurred.
     */
    bool appendln(T)(T stuff)
    {
        appendRaw(stuff);
        return appendln();
    }

    /* joinAppend is an optimization of append(inputRange.joiner(delimiter)).
     * This form is quite a bit faster, 40%+ on some benchmarks.
     */
    void joinAppend(InputRange, E)(InputRange inputRange, E delimiter)
    if (isInputRange!InputRange &&
        is(ElementType!InputRange : const C[]) &&
        (is(E : const C[]) || is(E : const C)))
    {
        /* The first element is written without a leading delimiter. */
        if (!inputRange.empty)
        {
            appendRaw(inputRange.front);
            inputRange.popFront;
        }
        foreach (x; inputRange)
        {
            appendRaw(delimiter);
            appendRaw(x);
        }
        flushIfMaxSize();
    }

    /* Make this an output range. */
    void put(T)(T stuff)
    {
        import std.traits : isSomeChar, isSomeString;

        static if (isSomeChar!T)
        {
            if (stuff == '\n') appendln();
            else appendRaw(stuff);
        }
        else static if (isSomeString!T)
        {
            if (stuff == "\n") appendln();
            else append(stuff);
        }
        else append(stuff);
    }
}

// BufferedOutputRange.
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_output");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : map, joiner;
    import std.range : iota;
    import std.conv : to;

    /* Basic test. Note that exiting the scope triggers flush. */
    string filepath1 = buildPath(testDir, "file1.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath1.File("w"));
        ostream.append("file1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln(100.to!string);
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* Test with no reserve and a flush at every line (flushSize is zero). */
    string filepath2 = buildPath(testDir, "file2.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath2.File("w"), 0, 0);
        ostream.append("file2: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* With a locking text writer. Requires version 2.078.0
       See: https://issues.dlang.org/show_bug.cgi?id=9661
     */
    static if (__VERSION__ >= 2078)
    {
        string filepath3 = buildPath(testDir, "file3.txt");
        {
            import std.stdio : File;

            auto ltw = filepath3.File("w").lockingTextWriter;
            {
                auto ostream = BufferedOutputRange!(typeof(ltw))(ltw);
                ostream.append("file3: ");
                ostream.append("abc");
                ostream.append(["def", "ghi", "jkl"]);
                ostream.appendln("100");
                ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
                ostream.appendln();
            }
        }
        assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
    }

    /* With an Appender. */
    import std.array : appender;
    auto app1 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1))(app1);
        ostream.append("appender1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* With an Appender, but checking flush boundaries. */
    auto app2 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+
        bool wasFlushed = false;

        assert(app2.data == "");

        ostream.append("12345678"); // Not flushed yet.
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Ninth char, not flushed yet.
        assert(!wasFlushed);
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Tenth char, now flushed.
        assert(wasFlushed);
        assert(app2.data == "12345678\n\n");

        app2.clear;
        assert(app2.data == "");

        ostream.append("12345678");

        wasFlushed = ostream.flushIfFull;
        assert(!wasFlushed);
        assert(app2.data == "");

        ostream.flush;
        assert(app2.data == "12345678");

        app2.clear;
        assert(app2.data == "");

        ostream.append("123456789012345");
        assert(app2.data == "");
    }
    assert(app2.data == "123456789012345");

    /* Using joinAppend. */
    auto app1b = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1b))(app1b);
        ostream.append("appenderB: ");
        ostream.joinAppend(["a", "bc", "def"], '-');
        ostream.append(':');
        ostream.joinAppend(["g", "hi", "jkl"], '-');
        ostream.appendln("*100*");
        ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' ');
        ostream.append(' ');
        ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " ");
        ostream.appendln();
    }
    assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n",
           "app1b.data: |" ~app1b.data ~ "|");

    /* Operating as an output range. When passed to a function as a ref, exiting
     * the function does not flush. When passed as a value, it gets flushed when
     * the function returns. Also test both UFCS and non-UFCS styles.
     */

    void outputStuffAsRef(T)(ref T range)
    if (isOutputRange!(T, char))
    {
        range.put('1');
        put(range, "23");
        range.put('\n');
        range.put(["5", "67"]);
        put(range, iota(8, 10).map!(x => x.to!string));
        put(range, "\n");
    }

    void outputStuffAsVal(T)(T range)
    if (isOutputRange!(T, char))
    {
        put(range, '1');
        range.put("23");
        put(range, '\n');
        put(range, ["5", "67"]);
        range.put(iota(8, 10).map!(x => x.to!string));
        range.put("\n");
    }

    auto app3 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0);
        outputStuffAsRef(ostream);
        assert(app3.data == "", "app3.data: |" ~app3.data ~ "|");
        outputStuffAsRef(ostream);
        assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|");
    }
    assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|");

    auto app4 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0);
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|");
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");
    }
    assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");

    /* Test maxSize. */
    auto app5 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10
        assert(app5.data == "");

        ostream.append("1234567");  // Not flushed yet (no newline).
        assert(app5.data == "");

        ostream.append("89012");    // Flushed by maxSize
        assert(app5.data == "123456789012");

        ostream.put("1234567");     // Not flushed yet (no newline).
        assert(app5.data == "123456789012");

        ostream.put("89012");       // Flushed by maxSize
        assert(app5.data == "123456789012123456789012");

        ostream.joinAppend(["ab", "cd"], '-');        // Not flushed yet
        ostream.joinAppend(["de", "gh", "ij"], '-');  // Flushed by maxSize
        assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
    }
    assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
}

/**
bufferedByLine is a performance enhancement over std.stdio.File.byLine. It works by
reading a large buffer from the input stream rather than just a single line.

The file argument needs to be a File object open for reading, typically a filesystem
file or standard input. Use the Yes.keepTerminator template parameter to keep the
newline. This is similar to stdio.File.byLine, except specified as a template parameter
rather than a runtime parameter.

The remaining template parameters are the element type of the returned lines (char or
ubyte), the line terminator byte, the number of bytes requested per read (readSize), and
the increment used when the buffer has to grow (growSize). growSize must be greater than
zero and no larger than readSize.

Reading in blocks does mean that input is not read until a full buffer is available or
end-of-file is reached. For this reason, bufferedByLine is not appropriate for
interactive input.
*/

auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char,
                    ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16)
    (File file)
if (is(Char == char) || is(Char == ubyte))
{
    static assert(0 < growSize && growSize <= readSize);

    static final class BufferedByLineImpl
    {
        /* Buffer state variables
         *   - _buffer.length - Full length of allocated buffer.
         *   - _dataEnd - End of currently valid data (end of last read).
         *   - _lineStart - Start of current line.
         *   - _lineEnd - End of current line.
         */
        private File _file;
        private ubyte[] _buffer;
        private size_t _lineStart = 0;
        private size_t _lineEnd = 0;
        private size_t _dataEnd = 0;

        this (File f)
        {
            _file = f;
            _buffer = new ubyte[readSize + growSize];
        }

        /* The range is empty once the file is at end-of-file and every buffered
         * byte has been consumed.
         */
        bool empty() const pure
        {
            return _file.eof && _lineStart == _dataEnd;
        }

        /* The current line, as a slice of the internal buffer. The terminator is
         * stripped unless Yes.keepTerminator was specified.
         */
        Char[] front() pure
        {
            assert(!empty, "Attempt to take the front of an empty bufferedByLine.");

            static if (keepTerminator == Yes.keepTerminator)
            {
                return cast(Char[]) _buffer[_lineStart .. _lineEnd];
            }
            else
            {
                assert(_lineStart < _lineEnd);
                immutable end = (_buffer[_lineEnd - 1] == terminator) ? _lineEnd - 1 : _lineEnd;
                return cast(Char[]) _buffer[_lineStart .. end];
            }
        }

        /* Note: Call popFront at initialization to do the initial read. */
        void popFront()
        {
            import std.algorithm: copy, find;
            assert(!empty, "Attempt to popFront an empty bufferedByLine.");

            /* Pop the current line. */
            _lineStart = _lineEnd;

            /* Set up the next line if more data is available, either in the buffer or
             * the file. The next line ends at the next newline, if there is one.
             *
             * Notes:
             * - 'find' returns the slice starting with the character searched for, or
             *   an empty range if not found.
             * - _lineEnd is set to _dataEnd both when the current buffer does not have
             *   a newline and when it ends with one.
             */
            auto found = _buffer[_lineStart .. _dataEnd].find(terminator);
            _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

            if (found.empty && !_file.eof)
            {
                /* No newline in current buffer. Read from the file until the next
                 * newline is found.
                 */
                assert(_lineEnd == _dataEnd);

                if (_lineStart > 0)
                {
                    /* Move remaining data to the start of the buffer. */
                    immutable remainingLength = _dataEnd - _lineStart;
                    copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. remainingLength]);
                    _lineStart = 0;
                    _lineEnd = _dataEnd = remainingLength;
                }

                do
                {
                    /* Grow the buffer if necessary. */
                    immutable availableSize = _buffer.length - _dataEnd;
                    if (availableSize < readSize)
                    {
                        size_t growBy = growSize;
                        while (availableSize + growBy < readSize) growBy += growSize;
                        _buffer.length += growBy;
                    }

                    /* Read the next block. */
                    _dataEnd +=
                        _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize])
                        .length;

                    /* Only the newly read bytes need to be searched. */
                    found = _buffer[_lineEnd .. _dataEnd].find(terminator);
                    _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

                } while (found.empty && !_file.eof);
            }
        }
    }

    assert(file.isOpen, "bufferedByLine passed a closed file.");

    auto r = new BufferedByLineImpl(file);
    if (!r.empty) r.popFront;
    return r;
}

// BufferedByLine.
unittest
{
    import std.array : appender;
    import std.conv : to;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;
    import std.range : lockstep;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline");
    scope(exit) testDir.rmdirRecurse;

    /* Create two data files with the same data. Read both in parallel with byLine and
     * bufferedByLine and compare each line.
     */
    auto data1 = appender!(char[])();

    foreach (i; 1 .. 1001) data1.put('\n');
    foreach (i; 1 .. 1001) data1.put("a\n");
    foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); }
    foreach (i; 1 .. 1001)
    {
        foreach (j; 1 .. i+1) data1.put('x');
        data1.put('\n');
    }

    string file1a = buildPath(testDir, "file1a.txt");
    string file1b = buildPath(testDir, "file1b.txt");
    {

        file1a.File("w").write(data1.data);
        file1b.File("w").write(data1.data);
    }

    /* Default parameters. */
    {
        auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator);
        auto f1bIn = file1b.File().byLine(No.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }
    {
        auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator);
        auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }

    /* Smaller read size. This will trigger buffer growth. */
    {
        auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', 512, 256);
        auto f1bIn = file1b.File().byLine(No.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }

    /* Exercise boundary cases in buffer growth.
     * Note: static-foreach requires DMD 2.076 / LDC 1.6
     */
    static foreach (readSize; [1, 2, 4])
    {
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
    }


    /* Files that do not end in a newline. */

    string file2a = buildPath(testDir, "file2a.txt");
    string file2b = buildPath(testDir, "file2b.txt");
    string file3a = buildPath(testDir, "file3a.txt");
    string file3b = buildPath(testDir, "file3b.txt");
    string file4a = buildPath(testDir, "file4a.txt");
    string file4b = buildPath(testDir, "file4b.txt");
    {
        file1a.File("w").write("a");
        file1b.File("w").write("a");
        file2a.File("w").write("ab");
        file2b.File("w").write("ab");
        file3a.File("w").write("abc");
        file3b.File("w").write("abc");
    }

    static foreach (readSize; [1, 2, 4])
    {
        static foreach (growSize; 1 .. readSize + 1)
        {{
            auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f1bIn = file1b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f2aIn = file2a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f2bIn = file2b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b);

            auto f3aIn = file3a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize);
            auto f3bIn = file3b.File().byLine(No.keepTerminator);
            foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b);
        }}
        static foreach (growSize; 1 ..
readSize + 1) 1035 {{ 1036 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1037 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1038 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1039 1040 auto f2aIn = file2a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1041 auto f2bIn = file2b.File().byLine(Yes.keepTerminator); 1042 foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1043 1044 auto f3aIn = file3a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1045 auto f3bIn = file3b.File().byLine(Yes.keepTerminator); 1046 foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1047 }} 1048 } 1049 } 1050 1051 /** 1052 joinAppend performs a join operation on an input range, appending the results to 1053 an output range. 1054 1055 joinAppend was written as a performance enhancement over using std.algorithm.joiner 1056 or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower 1057 than std.array.join with writeln. The joiner performance may be due to interaction 1058 with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter 1059 is better, but still substantially slower than join. Using join works reasonably well, 1060 but is allocating memory unnecessarily. 1061 1062 Using joinAppend with Appender is a bit faster than join, and allocates less memory. 1063 The Appender re-uses the underlying data buffer, saving memory. The example below 1064 illustrates. It is a modification of the InputFieldReordering example. The role 1065 Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange 1066 uses a similar technique to buffer multiple lines. 1067 1068 Note: The original uses joinAppend have been replaced by BufferedOutputRange, which has 1069 its own joinAppend method. 
However, joinAppend remains useful when constructing internal
buffers where BufferedOutputRange is not appropriate.

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndices = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndices);
    auto outputBuffer = appender!(char[]);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            outputBuffer.clear;
            writeln(fieldReordering.outputFields.joinAppend(outputBuffer, ('\t')));
        }
        else
        {
            writeln("Error: Insufficient number of fields on the line.");
        }
    }
    return 0;
}
---

Params:
    inputRange  = Input range of values to join. Its elements are either arrays of E
                  (each array is appended as a unit) or single E values.
    outputRange = Output range receiving the joined result. Existing content is kept;
                  the joined values are appended after it.
    delimiter   = Value written between consecutive elements of inputRange. Nothing is
                  written before the first element or after the last one.

Returns: outputRange, after the joined values have been appended to it.
*/
OutputRange joinAppend(InputRange, OutputRange, E)
    (InputRange inputRange, ref OutputRange outputRange, E delimiter)
if (isInputRange!InputRange &&
    ((is(ElementType!InputRange : const E[]) &&
      isOutputRange!(OutputRange, E[]))
     ||
     (is(ElementType!InputRange : const E) &&
      isOutputRange!(OutputRange, E))))
{
    /* The first element is written without a leading delimiter. */
    if (!inputRange.empty)
    {
        outputRange.put(inputRange.front);
        inputRange.popFront;
    }

    /* Every remaining element is preceded by the delimiter. */
    foreach (x; inputRange)
    {
        outputRange.put(delimiter);
        outputRange.put(x);
    }
    return outputRange;
}

// joinAppend.
@safe unittest
{
    import std.array : appender;
    import std.algorithm : equal;

    char[] c1 = ['a', 'b', 'c'];
    char[] c2 = ['d', 'e', 'f'];
    char[] c3 = ['g', 'h', 'i'];
    auto cvec = [c1, c2, c3];

    auto s1 = "abc";
    auto s2 = "def";
    auto s3 = "ghi";
    auto svec = [s1, s2, s3];

    auto charAppender = appender!(char[])();

    /* Joining mutable char arrays. The input must be left unchanged. */
    assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi");
    assert(equal(cvec, [c1, c2, c3]));

    /* Joining strings appends after the existing appender content. The check below
     * verifies the string input (svec), which is the range that was just joined.
     */
    charAppender.put('$');
    assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi");
    assert(equal(svec, [s1, s2, s3]));

    charAppender.clear;
    assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi");

    auto intAppender = appender!(int[])();

    auto i1 = [100, 101, 102];
    auto i2 = [200, 201, 202];
    auto i3 = [300, 301, 302];
    auto ivec = [i1, i2, i3];

    /* Range of arrays: each array is appended as a unit, delimiter between arrays. */
    assert(ivec.joinAppend(intAppender, 0).data ==
           [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]);

    /* Range of single values: delimiter between values. Output accumulates across calls. */
    intAppender.clear;
    assert(i1.joinAppend(intAppender, 0).data ==
           [100, 0, 101, 0, 102]);
    assert(i2.joinAppend(intAppender, 1).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202]);
    assert(i3.joinAppend(intAppender, 2).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202,
            300, 2, 301, 2, 302]);
}

/**
getTsvFieldValue extracts the value of a single field from a delimited text string.

This is a convenience function intended for cases when only a single field from an
input line is needed. If multiple values are needed, it will be more efficient to
work directly with std.algorithm.splitter or the InputFieldReordering class.

The input text is split by a delimiter character. The specified field is converted
to the desired type and the value returned.
An exception is thrown when the line has fewer fields than requested or when the
conversion fails. Conversion uses std.conv.to, which throws std.conv.ConvException on
failure. The "not enough fields" message reports 1-upped field numbers, matching the
numbering used by command line users.

Params:
    line       = The delimited text to extract the field from.
    fieldIndex = Zero-based index of the field to return.
    delim      = The field delimiter character.

Returns: The requested field converted to type T.
*/
T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim)
if (isSomeChar!C)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.format : format;
    import std.range;

    auto fields = line.splitter(delim);

    /* Skip the fields preceding the requested one, counting how many were present. */
    size_t fieldsSkipped = 0;
    for (; fieldsSkipped < fieldIndex && !fields.empty; ++fieldsSkipped)
    {
        fields.popFront;
    }

    /* Normal case: the requested field exists. */
    if (!fields.empty) return fields.front.to!T;

    /* The line ran out before reaching a field beyond the first. */
    if (fieldIndex != 0)
    {
        throw new Exception(
            format("Not enough fields on line. Number required: %d; Number found: %d",
                   fieldIndex + 1, fieldsSkipped));
    }

    /* Remaining case: field zero of an empty line. splitter returns an empty range for
     * empty input rather than a range holding one empty string, which does not represent
     * a single column file well. The input line itself serves as the empty field value.
     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
     */
    assert(line.empty);
    return line.to!T;
}

// getTsvFieldValue.
@safe unittest
{
    import std.conv : ConvException, to;
    import std.exception;

    /* Common cases. */
    assert(getTsvFieldValue!double("123", 0, '\t') == 123.0);
    assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5);
    assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123);
    assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99");
    assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红");
    assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅");
    assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑");
    assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5);

    /* The empty field cases. */
    assert(getTsvFieldValue!string("", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 1, '\t') == "");
    assert(getTsvFieldValue!string("", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 1, ':') == "");

    /* Tests with different data types. */
    string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5";
    char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]);
    dchar[] dcharLine = stringLine.to!(dchar[]);
    wchar[] wcharLine = stringLine.to!(wchar[]);

    assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5);

    assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5);

    /* Conversion errors. */
    assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t'));
    assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|'));
    assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|'));

    /* Not enough field errors. These should throw, but not a ConvException.*/
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t')));
}

/**
Field-lists - A field-list is a string entered on the command line identifying one or more
field numbers. They are used by the majority of the tsv utility applications. There are
two helper functions, makeFieldListOptionHandler and parseFieldList. Most applications
will use makeFieldListOptionHandler, it creates a delegate that can be passed to
std.getopt to process the command option. Actual processing of the option text is done by
parseFieldList. It can be called directly when the text of the option value contains more
than just the field number.

Syntax and behavior:

A 'field-list' is a list of numeric field numbers entered on the command line. Fields are
1-upped integers representing locations in an input line, in the traditional meaning of
Unix command line tools. Fields can be entered as single numbers or a range. Multiple
entries are separated by commas. Some examples (with 'fields' as the command line option):

   --fields 3                 // Single field
   --fields 4,1               // Two fields
   --fields 3-9               // A range, fields 3 to 9 inclusive
   --fields 1,2,7-34,11       // A mix of ranges and fields
   --fields 15-5,3-1          // Two ranges in reverse order.
Incomplete ranges are not supported, for example, '6-'. Zero is disallowed as a field
value by default, but can be enabled to support the notion of zero as representing the
entire line. However, zero cannot be part of a range. Field numbers are one-based by
default, but can be converted to zero-based. If conversion to zero-based is enabled, field
number zero must be disallowed or a signed integer type specified for the returned range.

An error is thrown if an invalid field specification is encountered. Error text is
intended for display. Error conditions include:
  - Empty fields list
  - Empty value, e.g. two consecutive commas, a trailing comma, or a leading comma
  - String that does not parse as a valid integer
  - Negative integers, or zero if zero is disallowed.
  - An incomplete range
  - Zero used as part of a range.

No other behaviors are enforced. Repeated values are accepted. If zero is allowed, other
field numbers can be entered as well. Additional restrictions need to be applied by the
caller.

Notes:
  - The data type determines the max field number that can be entered. Enabling conversion
    to zero restricts to the signed version of the data type.
  - Use 'import std.typecons : Yes, No' to use the convertToZeroBasedIndex and
    allowFieldNumZero template parameters.
*/

/** [Yes|No].convertToZeroBasedIndex parameter controls whether field numbers are
 *  converted to zero-based indices by makeFieldListOptionHandler and parseFieldList.
 */
alias ConvertToZeroBasedIndex = Flag!"convertToZeroBasedIndex";

/** [Yes|No].allowFieldNumZero parameter controls whether zero is a valid field. This is
 *  used by makeFieldListOptionHandler and parseFieldList.
*/
alias AllowFieldNumZero = Flag!"allowFieldNumZero";

/** Signature of the option handler delegates produced by makeFieldListOptionHandler. */
alias OptionHandlerDelegate = void delegate(string option, string value);

/**
makeFieldListOptionHandler creates a std.getopt option handler for processing field lists
entered on the command line. A field list is as defined by parseFieldList.

Each invocation of the returned delegate parses one option value and appends the
resulting field numbers to fieldsArray. If parsing throws, the exception message is
prefixed with the option name ("[--option] ") and the same exception is rethrown.
Field numbers appended before the failure remain in fieldsArray.

NOTE(review): the returned delegate captures the 'ref' parameter fieldsArray, so the
caller's array presumably needs to remain alive for as long as the delegate is used -
confirm against callers.

Params:
    fieldsArray = The array that receives the parsed field numbers.

Returns: A delegate that can be passed to std.getopt as an option handler.
*/
OptionHandlerDelegate makeFieldListOptionHandler(
    T,
    ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex,
    AllowFieldNumZero allowZero = No.allowFieldNumZero)
    (ref T[] fieldsArray)
if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T))
{
    /* Parses a single option value and appends its field numbers to 'target'. */
    void appendParsedFields(ref T[] target, string optionName, string optionValue) pure @safe
    {
        try
        {
            foreach (fieldNum; optionValue.parseFieldList!(T, convertToZero, allowZero))
            {
                target ~= fieldNum;
            }
        }
        catch (Exception parseError)
        {
            import std.format : format;

            /* Identify the offending command line option in the error text. */
            parseError.msg = format("[--%s] %s", optionName, parseError.msg);
            throw parseError;
        }
    }

    return (option, value) => appendParsedFields(fieldsArray, option, value);
}

// makeFieldListOptionHandler.
unittest
{
    import std.exception : assertThrown, assertNotThrown;
    import std.getopt;

    {
        size_t[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        size_t[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        size_t[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0]);
    }
    {
        size_t[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1]);
    }
    {
        /* Field-list handler used alongside a built-in std.getopt array option. */
        size_t[] ints;
        size_t[] fields;
        auto args = ["program", "--ints", "1,2,3", "--fields", "1", "--ints", "4,5,6", "--fields", "2,4,7-9,23-21"];

        /* arraySep is std.getopt module-level state. Restore the previous value on exit
         * so the setting does not leak into getopt calls made after this block.
         */
        immutable savedArraySep = std.getopt.arraySep;
        scope(exit) std.getopt.arraySep = savedArraySep;
        std.getopt.arraySep = ",";

        getopt(args,
               "i|ints", "Built-in list of integers.", &ints,
               "f|fields", "Field-list style integers.", fields.makeFieldListOptionHandler);
        assert(ints == [1, 2, 3, 4, 5, 6]);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }

    /* Basic cases involving unsigned types smaller than size_t. */
    {
        uint[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1", "-f", "55-58"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(uint, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1, 55, 56, 57, 58]);
    }
    {
        ushort[] fields;
        auto args = ["program", "-f", "0", "-f", "1,0", "-f", "0,1", "-f", "55-58"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(ushort, No.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [0, 1, 0, 0, 1, 55, 56, 57, 58]);
    }

    /* Basic cases involving signed types. */
    {
        long[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        long[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(long, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        long[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }
    {
        int[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        int[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(int, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        int[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }
    {
        short[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args, "f|fields", fields.makeFieldListOptionHandler);
        assert(fields == [1, 2, 4, 7, 8, 9, 23, 22, 21]);
    }
    {
        short[] fields;
        auto args = ["program", "--fields", "1", "--fields", "2,4,7-9,23-21"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(short, Yes.convertToZeroBasedIndex));
        assert(fields == [0, 1, 3, 6, 7, 8, 22, 21, 20]);
    }
    {
        short[] fields;
        auto args = ["program", "-f", "0"];
        getopt(args,
               "f|fields", fields.makeFieldListOptionHandler!(short, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
        assert(fields == [-1]);
    }

    {
        /* Error cases. */
        size_t[] fields;
        auto args = ["program", "-f", "0"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "-1"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "--fields", "1"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "a"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "1.5"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "2-"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "3,5,-7"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "3,5,"];
        assertThrown(getopt(args, "f|fields", fields.makeFieldListOptionHandler));

        args = ["program", "-f", "-1"];
        assertThrown(getopt(args,
                            "f|fields", fields.makeFieldListOptionHandler!(
                                size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)));
    }
}

/**
parseFieldList lazily generates a range of field numbers from a 'field-list' string.

The string is split on 'delim' and each entry (a single number or a number range) is
expanded by parseFieldRange. The first entry is parsed when parseFieldList is called.
Each later entry is parsed when the returned range advances onto it, so errors in later
entries surface during iteration rather than at the call.

Params:
    fieldList = The field-list text, e.g. "1,2,7-34,11".
    delim     = The character separating entries. Defaults to a comma.

Returns: An input range producing field numbers of type T.

Throws: Exception if an entry is not a valid field number or field range.
*/
auto parseFieldList(T = size_t,
                    ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex,
                    AllowFieldNumZero allowZero = No.allowFieldNumZero)
    (string fieldList, char delim = ',')
if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T))
{
    import std.algorithm : splitter;

    /* State shared with the Result range below: the entries not yet parsed, and the
     * expansion of the entry currently being iterated. An empty field-list is passed
     * to parseFieldRange as an empty string so that it reports the error.
     */
    auto _splitFieldList = fieldList.splitter(delim);
    auto _currFieldParse =
        (_splitFieldList.empty ? "" : _splitFieldList.front)
        .parseFieldRange!(T, convertToZero, allowZero);

    if (!_splitFieldList.empty) _splitFieldList.popFront;

    struct Result
    {
        @property bool empty() pure nothrow @safe @nogc
        {
            return _currFieldParse.empty;
        }

        @property T front() pure @safe
        {
            import std.conv : to;

            assert(!empty, "Attempting to fetch the front of an empty field-list.");
            assert(!_currFieldParse.empty, "Internal error. Call to front with an empty _currFieldParse.");

            return _currFieldParse.front.to!T;
        }

        void popFront() pure @safe
        {
            assert(!empty, "Attempting to popFront an empty field-list.");

            /* Advance within the current entry. Move to the next entry when exhausted. */
            _currFieldParse.popFront;
            if (_currFieldParse.empty && !_splitFieldList.empty)
            {
                _currFieldParse = _splitFieldList.front.parseFieldRange!(T, convertToZero, allowZero);
                _splitFieldList.popFront;
            }
        }
    }

    return Result();
}

// parseFieldList.
@safe unittest
{
    import std.algorithm : each, equal;
    import std.exception : assertThrown, assertNotThrown;

    /* Basic tests. */
    assert("1".parseFieldList.equal([1]));
    assert("1,2".parseFieldList.equal([1, 2]));
    assert("1,2,3".parseFieldList.equal([1, 2, 3]));
    assert("1-2".parseFieldList.equal([1, 2]));
    assert("1-2,6-4".parseFieldList.equal([1, 2, 6, 5, 4]));
    assert("1-2,1,1-2,2,2-1".parseFieldList.equal([1, 2, 1, 1, 2, 2, 2, 1]));
    assert("1-2,5".parseFieldList!size_t.equal([1, 2, 5]));

    /* Signed Int tests */
    assert("1".parseFieldList!int.equal([1]));
    assert("1,2,3".parseFieldList!int.equal([1, 2, 3]));
    assert("1-2".parseFieldList!int.equal([1, 2]));
    assert("1-2,6-4".parseFieldList!int.equal([1, 2, 6, 5, 4]));
    assert("1-2,5".parseFieldList!int.equal([1, 2, 5]));

    /* Convert to zero tests */
    assert("1".parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0]));
    assert("1,2,3".parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 2]));
    assert("1-2".parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1]));
    assert("1-2,6-4".parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3]));
    assert("1-2,5".parseFieldList!(size_t, Yes.convertToZeroBasedIndex).equal([0, 1, 4]));

    assert("1".parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0]));
    assert("1,2,3".parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 2]));
    assert("1-2".parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1]));
    assert("1-2,6-4".parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 5, 4, 3]));
    assert("1-2,5".parseFieldList!(long, Yes.convertToZeroBasedIndex).equal([0, 1, 4]));

    /* Allow zero tests. */
    assert("0".parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0]));
    assert("1,0,3".parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3]));
    assert("1-2,5".parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5]));
    assert("0".parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0]));
    assert("1,0,3".parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 0, 3]));
    assert("1-2,5".parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([1, 2, 5]));
    assert("0".parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1]));
    assert("1,0,3".parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, -1, 2]));
    assert("1-2,5".parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0, 1, 4]));

    /* Error cases. */
    assertThrown("".parseFieldList.each);
    assertThrown(" ".parseFieldList.each);
    assertThrown(",".parseFieldList.each);
    assertThrown("5 6".parseFieldList.each);
    assertThrown(",7".parseFieldList.each);
    assertThrown("8,".parseFieldList.each);
    assertThrown("8,9,".parseFieldList.each);
    assertThrown("10,,11".parseFieldList.each);
    assertThrown("".parseFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("1,2-3,".parseFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("2-,4".parseFieldList!(long, Yes.convertToZeroBasedIndex).each);
    assertThrown("1,2,3,,4".parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown(",7".parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("8,".parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("10,0,,11".parseFieldList!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("8,9,".parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);

    assertThrown("0".parseFieldList.each);
    assertThrown("1,0,3".parseFieldList.each);
    assertThrown("0".parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each);
    assertThrown("1,0,3".parseFieldList!(int, Yes.convertToZeroBasedIndex, No.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseFieldList!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
    assertThrown("0-2,6-0".parseFieldList!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).each);
}

/* parseFieldRange parses a single number or number range. E.g. '5' or '5-8'. These are
 * the values in a field-list separated by a comma or other delimiter.
It returns a range
 * that iterates over all the values in the range.
 *
 * Template parameters:
 *   T             - Integral type the field numbers are parsed as.
 *   convertToZero - When Yes, both end points are decremented by one before the
 *                   range is built.
 *   allowZero     - When Yes, a lone '0' is accepted. Zero is still rejected as an
 *                   end point of a range (e.g. '0-3' or '3-0').
 *
 * A descending range (e.g. '4-3') produces values in descending order. An exception
 * is thrown for an empty string, a range missing one of its end points, a field
 * number below the permitted minimum, or text that cannot be converted to the
 * integral type.
 */
private auto parseFieldRange(T = size_t,
                             ConvertToZeroBasedIndex convertToZero = No.convertToZeroBasedIndex,
                             AllowFieldNumZero allowZero = No.allowFieldNumZero)
    (string fieldRange)
if (isIntegral!T && (!allowZero || !convertToZero || !isUnsigned!T))
{
    import std.algorithm : findSplit;
    import std.conv : to;
    import std.exception : enforce;
    import std.format : format;
    import std.range : iota;
    import std.traits : Signed;

    /* Type used for the end points of the iota range. The signed counterpart of T is
     * needed when converting to zero-based values, because a descending range may
     * then have -1 as its (exclusive) end.
     */
    static if (convertToZero) alias S = Signed!T;
    else alias S = T;

    enforce(fieldRange.length != 0, "Empty field number.");

    /* parts[0] is the text before the first dash, parts[1] the dash itself (empty
     * when there is none), parts[2] whatever follows the dash.
     */
    auto parts = findSplit(fieldRange, "-");
    immutable bool hasDash = !parts[1].empty;

    /* A dash needs a value on both sides. */
    enforce(!hasDash || (!parts[0].empty && !parts[2].empty),
            format("Incomplete ranges are not supported: '%s'", fieldRange));

    S start = parts[0].to!S;
    S last = start;
    if (hasDash) last = parts[2].to!S;

    /* Step direction: count down when the range was written high-to-low. */
    Signed!T increment = 1;
    if (start > last) increment = -1;

    static if (allowZero)
    {
        /* Zero is only meaningful on its own, never as a range end point. */
        enforce(!hasDash || (start != 0 && last != 0),
                format("Zero cannot be used as part of a range: '%s'", fieldRange));

        enforce(start >= 0 && last >= 0,
                format("Field numbers must be non-negative integers: '%d'",
                       (start < 0) ? start : last));
    }
    else
    {
        enforce(start >= 1 && last >= 1,
                format("Field numbers must be greater than zero: '%d'",
                       (start < 1) ? start : last));
    }

    static if (convertToZero)
    {
        --start;
        --last;
    }

    /* iota excludes its end value, so step one past 'last' to include it. */
    return iota(start, last + increment, increment);
}

// parseFieldRange.
@safe unittest
{
    import std.algorithm : equal;
    import std.exception : assertThrown, assertNotThrown;

    /* Basic cases */
    assert(parseFieldRange("1").equal([1]));
    assert("2".parseFieldRange.equal([2]));
    assert("3-4".parseFieldRange.equal([3, 4]));
    assert("3-5".parseFieldRange.equal([3, 4, 5]));
    assert("4-3".parseFieldRange.equal([4, 3]));
    assert("10-1".parseFieldRange.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1]));

    /* Convert to zero-based indices */
    assert(parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)("1").equal([0]));
    assert("2".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex).equal([1]));
    assert("3-4".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex).equal([2, 3]));
    assert("3-5".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex).equal([2, 3, 4]));
    assert("4-3".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex).equal([3, 2]));
    assert("10-1".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex).equal([9, 8, 7, 6, 5, 4, 3, 2, 1, 0]));

    /* Allow zero. */
    assert("0".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0]));
    assert(parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)("1").equal([1]));
    assert("3-4".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([3, 4]));
    assert("10-1".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1]));

    /* Allow zero, convert to zero-based index.
*/ 1762 assert("0".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 1763 assert(parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)("1").equal([0])); 1764 assert("3-4".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([2, 3])); 1765 assert("10-1".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])); 1766 1767 /* Alternate integer types. */ 1768 assert("2".parseFieldRange!uint.equal([2])); 1769 assert("3-5".parseFieldRange!uint.equal([3, 4, 5])); 1770 assert("10-1".parseFieldRange!uint.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 1771 assert("2".parseFieldRange!int.equal([2])); 1772 assert("3-5".parseFieldRange!int.equal([3, 4, 5])); 1773 assert("10-1".parseFieldRange!int.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 1774 assert("2".parseFieldRange!ushort.equal([2])); 1775 assert("3-5".parseFieldRange!ushort.equal([3, 4, 5])); 1776 assert("10-1".parseFieldRange!ushort.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 1777 assert("2".parseFieldRange!short.equal([2])); 1778 assert("3-5".parseFieldRange!short.equal([3, 4, 5])); 1779 assert("10-1".parseFieldRange!short.equal([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])); 1780 1781 assert("0".parseFieldRange!(long, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 1782 assert("0".parseFieldRange!(uint, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 1783 assert("0".parseFieldRange!(int, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 1784 assert("0".parseFieldRange!(ushort, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 1785 assert("0".parseFieldRange!(short, No.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([0])); 1786 assert("0".parseFieldRange!(int, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 1787 assert("0".parseFieldRange!(short, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero).equal([-1])); 1788 
1789 /* Max field value cases. */ 1790 assert("65535".parseFieldRange!ushort.equal([65535])); // ushort max 1791 assert("65533-65535".parseFieldRange!ushort.equal([65533, 65534, 65535])); 1792 assert("32767".parseFieldRange!short.equal([32767])); // short max 1793 assert("32765-32767".parseFieldRange!short.equal([32765, 32766, 32767])); 1794 assert("32767".parseFieldRange!(short, Yes.convertToZeroBasedIndex).equal([32766])); 1795 1796 /* Error cases. */ 1797 assertThrown("".parseFieldRange); 1798 assertThrown(" ".parseFieldRange); 1799 assertThrown("-".parseFieldRange); 1800 assertThrown(" -".parseFieldRange); 1801 assertThrown("- ".parseFieldRange); 1802 assertThrown("1-".parseFieldRange); 1803 assertThrown("-2".parseFieldRange); 1804 assertThrown("-1".parseFieldRange); 1805 assertThrown("1.0".parseFieldRange); 1806 assertThrown("0".parseFieldRange); 1807 assertThrown("0-3".parseFieldRange); 1808 assertThrown("3-0".parseFieldRange); 1809 assertThrown("-2-4".parseFieldRange); 1810 assertThrown("2--4".parseFieldRange); 1811 assertThrown("2-".parseFieldRange); 1812 assertThrown("a".parseFieldRange); 1813 assertThrown("0x3".parseFieldRange); 1814 assertThrown("3U".parseFieldRange); 1815 assertThrown("1_000".parseFieldRange); 1816 assertThrown(".".parseFieldRange); 1817 1818 assertThrown("".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1819 assertThrown(" ".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1820 assertThrown("-".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1821 assertThrown("1-".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1822 assertThrown("-2".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1823 assertThrown("-1".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1824 assertThrown("0".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1825 assertThrown("0-3".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1826 assertThrown("3-0".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1827 
assertThrown("-2-4".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1828 assertThrown("2--4".parseFieldRange!(size_t, Yes.convertToZeroBasedIndex)); 1829 1830 assertThrown("".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1831 assertThrown(" ".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1832 assertThrown("-".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1833 assertThrown("1-".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1834 assertThrown("-2".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1835 assertThrown("-1".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1836 assertThrown("0-3".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1837 assertThrown("3-0".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1838 assertThrown("-2-4".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1839 assertThrown("2--4".parseFieldRange!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1840 1841 assertThrown("".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1842 assertThrown(" ".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1843 assertThrown("-".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1844 assertThrown("1-".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1845 assertThrown("-2".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1846 assertThrown("-1".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1847 assertThrown("0-3".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1848 assertThrown("3-0".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero)); 1849 
assertThrown("-2-4".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));
    assertThrown("2--4".parseFieldRange!(long, Yes.convertToZeroBasedIndex, Yes.allowFieldNumZero));

    /* Value out of range cases. */
    assertThrown("65536".parseFieldRange!ushort);   // One more than ushort max.
    assertThrown("65535-65536".parseFieldRange!ushort);
    assertThrown("32768".parseFieldRange!short);    // One more than short max.
    assertThrown("32765-32768".parseFieldRange!short);
    // Convert to zero limits signed range.
    assertThrown("32768".parseFieldRange!(ushort, Yes.convertToZeroBasedIndex));
    assert("32767".parseFieldRange!(ushort, Yes.convertToZeroBasedIndex).equal([32766]));
}

/** [Yes|No.newlineWasRemoved] is a template parameter to throwIfWindowsNewlineOnUnix.
 *  A Yes value indicates the Unix newline was already removed, as might be done via
 *  std.File.byLine or similar mechanism.
 */
alias NewlineWasRemoved = Flag!"newlineWasRemoved";

/**
throwIfWindowsNewlineOnUnix throws an exception if a Windows/DOS line ending is
found on a build compiled for a Unix (Posix) platform. The TSV Utilities use it to
detect Windows/DOS line endings and terminate processing with an error message to
the user.

With Yes.newlineWasRemoved (the default) the line is expected to have had its '\n'
stripped already, so a trailing '\r' is what identifies a Windows line ending. With
No.newlineWasRemoved the line is checked for a trailing "\r\n" pair.

Params:
    line     = The input line to check.
    filename = Name of the file the line came from. "-" is reported as
               "Standard Input" in the error message.
    lineNum  = Line number reported in the error message.

Throws: Exception if the line has a Windows/DOS line ending. On non-Posix builds the
function body is empty and nothing is ever thrown.
*/
void throwIfWindowsNewlineOnUnix
    (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved)
    (const char[] line, const char[] filename, size_t lineNum)
{
    version(Posix)
    {
        /* The trailing characters that identify a Windows/DOS line ending. */
        static if (nlWasRemoved) enum windowsEnding = "\r";
        else enum windowsEnding = "\r\n";

        /* Lines shorter than the ending cannot contain it. */
        if (line.length < windowsEnding.length) return;
        if (line[$ - windowsEnding.length .. $] != windowsEnding) return;

        import std.format : format;

        const(char)[] sourceName = filename;
        if (filename == "-") sourceName = "Standard Input";

        throw new Exception(
            format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n File: %s, Line: %s",
                   sourceName, lineNum));
    }
}

// throwIfWindowsNewlineOnUnix
@safe unittest
{
    /* Note: Currently only building on Posix. Need to add non-Posix test cases
     * if Windows builds are ever done.
     */
    version(Posix)
    {
        import std.exception;

        assertNotThrown(throwIfWindowsNewlineOnUnix("", "afile.tsv", 1));
        assertNotThrown(throwIfWindowsNewlineOnUnix("a", "afile.tsv", 2));
        assertNotThrown(throwIfWindowsNewlineOnUnix("ab", "afile.tsv", 3));
        assertNotThrown(throwIfWindowsNewlineOnUnix("abc", "afile.tsv", 4));

        assertThrown(throwIfWindowsNewlineOnUnix("\r", "afile.tsv", 1));
        assertThrown(throwIfWindowsNewlineOnUnix("a\r", "afile.tsv", 2));
        assertThrown(throwIfWindowsNewlineOnUnix("ab\r", "afile.tsv", 3));
        assertThrown(throwIfWindowsNewlineOnUnix("abc\r", "afile.tsv", 4));

        assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\n", "afile.tsv", 1));
        assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\n", "afile.tsv", 2));
        assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\n", "afile.tsv", 3));
        assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\n", "afile.tsv", 4));

        assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "afile.tsv", 5));
        assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\r\n", "afile.tsv", 6));
        assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\r\n", "afile.tsv", 7));
        assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\r\n", "afile.tsv", 8));

        /* Standard Input formatting.
*/
        import std.algorithm : endsWith;
        bool exceptionCaught = false;

        try (throwIfWindowsNewlineOnUnix("\r", "-", 99));
        catch (Exception e)
        {
            assert(e.msg.endsWith("File: Standard Input, Line: 99"));
            exceptionCaught = true;
        }
        finally
        {
            assert(exceptionCaught);
            exceptionCaught = false;
        }

        try (throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "-", 99));
        catch (Exception e)
        {
            assert(e.msg.endsWith("File: Standard Input, Line: 99"));
            exceptionCaught = true;
        }
        finally
        {
            assert(exceptionCaught);
            exceptionCaught = false;
        }
    }
}

/** Flag used by InputSourceRange to determine if the header line should be read
    when opening a file.
*/
alias ReadHeader = Flag!"readHeader";

/**
inputSourceRange is a helper function for creating new InputSourceRange objects.
*/
InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader)
{
    return new InputSourceRange(filepaths, readHeader);
}

/**
InputSourceRange is an input range that iterates over a set of input files.

InputSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The caller can choose to
have header lines read automatically.

The range is created from a set of filepaths. These filepaths are mapped to
InputSource objects during the iteration. This is what enables automatically opening
and closing files and reading the header line.

The motivation for an InputSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file and the header line along to the main processing functions.
This enables features like named fields to be implemented in a standard way.

Both InputSourceRange and InputSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new InputSourceRange against the same filepaths.

Currently, InputSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
*/
final class InputSourceRange
{
    import std.range;

    private string[] _filepaths;     // Paths not yet turned into an InputSource.
    private ReadHeader _readHeader;
    private InputSource _front;      // Current element; null once the range is exhausted.

    /* The constructor copies the filepath list and opens the first file, if any. */
    this(string[] filepaths, ReadHeader readHeader)
    {
        _filepaths = filepaths.dup;
        _readHeader = readHeader;
        openNextSource();
    }

    /* Number of sources remaining, counting the current front element. */
    size_t length() const pure nothrow @safe
    {
        if (empty) return 0;
        return _filepaths.length + 1;
    }

    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    InputSource front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty InputSourceRange");
        return _front;
    }

    /* Closes the current source, then opens the next one if there is one. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty InputSourceRange");

        _front.close;
        openNextSource();
    }

    /* Makes the next pending filepath the front element and opens it. Sets the front
     * element to null when no filepaths remain.
     */
    private void openNextSource()
    {
        if (_filepaths.empty)
        {
            _front = null;
            return;
        }

        _front = new InputSource(_filepaths.front, _readHeader);
        _front.open;
        _filepaths.popFront;
    }
}

/**
InputSource is a class of objects produced by iterating over an InputSourceRange.

An InputSource object provides access to the open file currently the front element
of an InputSourceRange.
The main methods application code is likely to need are:

$(LIST
    * `file()` - Returns the File object. The file will be open for reading as long
      as the InputSource instance is the front element of the InputSourceRange it
      came from.

    * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the
      header line from the file. An empty string is returned if InputSource range
      was created with readHeader=false.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      InputSourceRange. For standard input, it is "Standard Input".
)

An InputSource is a reference object, so the copies will retain the state of the
InputSourceRange front element. In particular, all copies will have the open
state of the front element of the InputSourceRange.

This class is not intended for use outside the context of an InputSourceRange.
*/
final class InputSource
{
    import std.range;
    import std.stdio;

    private immutable string _filepath;
    private immutable bool _isStdin;     // True when the filepath is "-".
    private bool _isOpen;
    private ReadHeader _readHeader;
    private bool _hasBeenOpened;         // Set by open(), never reset.
    private string _header;              // Header line as read, terminator included.
    private File _file;

    private this(string filepath, ReadHeader readHeader) pure nothrow @safe
    {
        _filepath = filepath;
        _isStdin = filepath == "-";
        _isOpen = false;
        _readHeader = readHeader;
        _hasBeenOpened = false;
    }

    /** file returns the File object held by the InputSource.
     *
     * The File will be open for reading as long as the InputSource instance is the
     * front element of the InputSourceRange it came from.
     */
    File file() nothrow @safe
    {
        return _file;
    }

    /** isReadHeaderEnabled returns true if the header line is being read.
     */
    bool isReadHeaderEnabled() const pure nothrow @safe
    {
        return _readHeader == Yes.readHeader;
    }

    /** header returns the header line from the input file.
     *
     * An empty string is returned if InputSource range was created with
     * readHeader=false. Unless Yes.keepTerminator is passed, a trailing '\n' is
     * removed from the returned line.
     */
    string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe
    {
        assert(_hasBeenOpened);

        if (keepTerminator == Yes.keepTerminator) return _header;

        /* Nothing to strip: either no data at all or no trailing newline. */
        if (_header.length == 0 || _header[$ - 1] != '\n') return _header;

        return _header[0 .. $ - 1];
    }

    /** isHeaderEmpty returns true if there is no data for a header, including the
     * terminator.
     *
     * When headers are being read, this is true only if the file is empty.
     */
    bool isHeaderEmpty() const pure nothrow @safe
    {
        assert(_hasBeenOpened);
        return _header.length == 0;
    }

    /** name returns a user friendly name representing the input source.
     *
     * For files, it is the filepath provided to InputSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input,
     * not name().)
     */
    string name() const pure nothrow @safe
    {
        if (_isStdin) return "Standard Input";
        return _filepath;
    }

    /** isStdin returns true if the input source is Standard Input, false otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the input source is open for reading, false otherwise.
     *
     * "Open" in this context is whether the InputSource object is currently open,
     * meaning that it is the front element of the InputSourceRange that created it.
     *
     * For files, this is also reflected in the state of the underlying File object.
     * However, standard input is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the input source and, when header reading is enabled, reads the first
     * line into the header. May only be called once per instance.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        if (_isStdin) _file = stdin;
        else _file = File(_filepath, "rb");

        if (_readHeader) _header = _file.readln;

        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the source as closed. The underlying File is closed as well, except for
     * standard input, which is left open.
     */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}

// InputSourceRange and InputSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    string file0 = buildPath(testDir, "file0.txt");
    string file1 = buildPath(testDir, "file1.txt");
    string file2 = buildPath(testDir, "file2.txt");
    string file3 = buildPath(testDir, "file3.txt");

    string file0Header = "";
    string file1Header = "file 1 header\n";
    string file2Header = "file 2 header\n";
    string file3Header = "file 3 header\n";

    string file0Body = "";
    string file1Body = "";
    string file2Body = "file 2 line 1\n";
    string file3Body = "file 3 line 1\nfile 3 line 2\n";

    string file0Data = file0Header ~ file0Body;
    string file1Data = file1Header ~ file1Body;
    string file2Data = file2Header ~ file2Body;
    string file3Data = file3Header ~ file3Body;

    {
        file0.File("w").write(file0Data);
        file1.File("w").write(file1Data);
        file2.File("w").write(file2Data);
        file3.File("w").write(file3Data);
    }

    auto inputFiles = [file0, file1, file2, file3];
    auto fileHeaders = [file0Header, file1Header, file2Header, file3Header];
    auto fileBodies =
[file0Body, file1Body, file2Body, file3Body]; 2238 auto fileData = [file0Data, file1Data, file2Data, file3Data]; 2239 2240 auto readSources = appender!(InputSource[]); 2241 auto buffer = new char[1024]; // Must be large enough to hold the test files. 2242 2243 /* Tests without standard input. Don't want to count on state of standard 2244 * input or modifying it when doing unit tests, so avoid reading from it. 2245 */ 2246 2247 foreach(numFiles; 1 .. inputFiles.length + 1) 2248 { 2249 /* Reading headers. */ 2250 2251 readSources.clear; 2252 auto inputSourcesYesHeader = inputSourceRange(inputFiles[0 .. numFiles], Yes.readHeader); 2253 assert(inputSourcesYesHeader.length == numFiles); 2254 2255 foreach(fileNum, source; inputSourcesYesHeader.enumerate) 2256 { 2257 readSources.put(source); 2258 assert(source.isOpen); 2259 assert(source.file.isOpen); 2260 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 2261 assert(readSources.data[fileNum].isOpen); 2262 2263 assert(source.header(Yes.keepTerminator) == fileHeaders[fileNum]); 2264 2265 auto headerNoTerminatorLength = fileHeaders[fileNum].length; 2266 if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 2267 assert(source.header(No.keepTerminator) == 2268 fileHeaders[fileNum][0 .. headerNoTerminatorLength]); 2269 2270 assert(source.name == inputFiles[fileNum]); 2271 assert(!source.isStdin); 2272 assert(source.isReadHeaderEnabled); 2273 2274 assert(source.file.rawRead(buffer) == fileBodies[fileNum]); 2275 } 2276 2277 /* The InputSourceRange is a reference range, consumed by the foreach. */ 2278 assert(inputSourcesYesHeader.empty); 2279 2280 /* Without reading headers. */ 2281 2282 readSources.clear; 2283 auto inputSourcesNoHeader = inputSourceRange(inputFiles[0 .. 
numFiles], No.readHeader); 2284 assert(inputSourcesNoHeader.length == numFiles); 2285 2286 foreach(fileNum, source; inputSourcesNoHeader.enumerate) 2287 { 2288 readSources.put(source); 2289 assert(source.isOpen); 2290 assert(source.file.isOpen); 2291 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 2292 assert(readSources.data[fileNum].isOpen); 2293 2294 assert(source.header(Yes.keepTerminator).empty); 2295 assert(source.header(No.keepTerminator).empty); 2296 2297 assert(source.name == inputFiles[fileNum]); 2298 assert(!source.isStdin); 2299 assert(!source.isReadHeaderEnabled); 2300 2301 assert(source.file.rawRead(buffer) == fileData[fileNum]); 2302 } 2303 2304 /* The InputSourceRange is a reference range, consumed by the foreach. */ 2305 assert(inputSourcesNoHeader.empty); 2306 } 2307 2308 /* Tests with standard input. No actual reading in these tests. 2309 */ 2310 2311 readSources.clear; 2312 foreach(fileNum, source; inputSourceRange(["-", "-"], No.readHeader).enumerate) 2313 { 2314 readSources.put(source); 2315 assert(source.isOpen); 2316 assert(source.file.isOpen); 2317 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); // InputSource objects are "closed". 2318 assert(readSources.data[0 .. fileNum].all!(s => s.file.isOpen)); // Actual stdin should not be closed. 2319 assert(readSources.data[fileNum].isOpen); 2320 2321 assert(source.header(Yes.keepTerminator).empty); 2322 assert(source.header(No.keepTerminator).empty); 2323 2324 assert(source.name == "Standard Input"); 2325 assert(source.isStdin); 2326 } 2327 2328 /* Empty filelist. */ 2329 string[] nofiles; 2330 { 2331 auto sources = inputSourceRange(nofiles, No.readHeader); 2332 assert(sources.empty); 2333 } 2334 { 2335 auto sources = inputSourceRange(nofiles, Yes.readHeader); 2336 assert(sources.empty); 2337 } 2338 2339 /* Error cases. 
*/
    assertThrown(inputSourceRange([file0, "no_such_file.txt"], No.readHeader).each);
    assertThrown(inputSourceRange(["no_such_file.txt", file1], Yes.readHeader).each);
}

/**
byLineSourceRange is a helper function for creating new ByLineSourceRange objects.
*/
auto byLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
(string[] filepaths)
if (is(Char == char) || is(Char == ubyte))
{
    return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths);
}

/**
ByLineSourceRange is an input range that iterates over a set of input files. It
provides bufferedByLine access to each file.

A ByLineSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The front element of the
range provides access to a bufferedByLine for iterating over the lines in the file.

The range is created from a set of filepaths. These filepaths are mapped to
ByLineSource objects during the iteration. This is what enables automatically opening
and closing files and providing bufferedByLine access.

The motivation behind ByLineSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file along to the main processing functions. This enables
features like named fields to be implemented in a standard way.

Access to the first line of the first file is available after creating the
ByLineSourceRange instance. The first file is opened and a bufferedByLine created.
The first line of the first file is available via byLine.front (after checking
!byLine.empty).

Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new ByLineSourceRange against the same filepaths.

Currently, ByLineSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
*/
final class ByLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;

    /* Element type of the range. The Char template parameter is forwarded so that
     * the element type follows the character type the range was instantiated with.
     */
    alias ByLineSourceType = ByLineSource!(keepTerminator, Char, terminator);

    private string[] _filepaths;        // Paths not yet turned into a ByLineSource.
    private ByLineSourceType _front;    // Current element; null once exhausted.

    /* The constructor copies the filepath list and opens the first file, if any. */
    this(string[] filepaths)
    {
        _filepaths = filepaths.dup;
        _front = null;

        if (!_filepaths.empty)
        {
            _front = new ByLineSourceType(_filepaths.front);
            _front.open;
            _filepaths.popFront;
        }
    }

    /* Number of sources remaining, counting the current front element. */
    size_t length() const pure nothrow @safe
    {
        return empty ? 0 : _filepaths.length + 1;
    }

    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    ByLineSourceType front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty ByLineSourceRange");
        return _front;
    }

    /* Closes the current source, then opens the next one if there is one. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty ByLineSourceRange");

        _front.close;

        if (!_filepaths.empty)
        {
            _front = new ByLineSourceType(_filepaths.front);
            _front.open;
            _filepaths.popFront;
        }
        else
        {
            _front = null;
        }
    }
}

/**
ByLineSource is a class of objects produced by iterating over an ByLineSourceRange.

A ByLineSource instance provides a bufferedByLine range for the current the front
element of a ByLineSourceRange. The main methods application code is likely to
need are:

$(LIST
    * `byLine()` - Returns the bufferedByLine range accessing the open file.
The file
      will be open for reading (using the bufferedByLine range) as long as the
      ByLineSource instance is the front element of the ByLineSourceRange
      it came from.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      ByLineSourceRange. For standard input, it is "Standard Input".
)

A ByLineSource is a reference object, so the copies have the same state as the
ByLineSourceRange front element. In particular, all copies will have the open
state of the front element of the ByLineSourceRange.

This class is not intended for use outside the context of an ByLineSourceRange.
*/
final class ByLineSource(
    KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;
    import std.stdio;
    import std.traits : ReturnType;

    /* The Char template parameter is forwarded to bufferedByLine so that the line
     * type follows the character type the class was instantiated with.
     */
    alias newByLineFn = bufferedByLine!(keepTerminator, Char, terminator);
    alias ByLineType = ReturnType!newByLineFn;

    private immutable string _filepath;
    private immutable bool _isStdin;     // True when the filepath is "-".
    private bool _isOpen;
    private bool _hasBeenOpened;         // Set by open(), never reset.
    private File _file;
    private ByLineType _byLineRange;

    private this(string filepath) pure nothrow @safe
    {
        _filepath = filepath;
        _isStdin = filepath == "-";
        _isOpen = false;
        _hasBeenOpened = false;
    }

    /** byLine returns the bufferedByLine object held by the ByLineSource instance.
     *
     * The File underlying the BufferedByLine object is open for reading as long as
     * the ByLineSource instance is the front element of the ByLineSourceRange it
     * came from.
     */
    ByLineType byLine() nothrow @safe
    {
        return _byLineRange;
    }

    /** name returns a user friendly name representing the underlying input source.
     *
     * For files, it is the filepath provided to ByLineSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input,
     * rather than comparing against name().)
     */
    string name() const pure nothrow @safe
    {
        return _isStdin ? "Standard Input" : _filepath;
    }

    /** isStdin returns true if the underlying input source is Standard Input, false
     * otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the ByLineSource instance is open for reading, false
     * otherwise.
     *
     * "Open" in this context is whether the ByLineSource object is currently "open".
     * The underlying input source backing it does not necessarily have the same
     * state. The ByLineSource instance is "open" if it is the front element of the
     * ByLineSourceRange that created it.
     *
     * The underlying input source object follows the same open/close state as makes
     * sense. In particular, real files are closed when the ByLineSource object is
     * closed. The exception is standard input, which is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the input source and creates the bufferedByLine range over it. May only
     * be called once per instance.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        _file = isStdin ? stdin : _filepath.File("rb");
        _byLineRange = newByLineFn(_file);
        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the source as closed. The underlying File is closed as well, except for
     * standard input, which is left open.
     */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}

// ByLineSourceRange and ByLineSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    string file0 = buildPath(testDir, "file0.txt");
    string file1 = buildPath(testDir, "file1.txt");
    string file2 = buildPath(testDir, "file2.txt");
    string file3 = buildPath(testDir, "file3.txt");

    string file0Header = "";
    string file1Header = "file 1 header\n";
    string file2Header = "file 2 header\n";
    string file3Header = "file 3 header\n";

    string file0Body = "";
    string file1Body = "";
    string file2Body = "file 2 line 1\n";
    string file3Body = "file 3 line 1\nfile 3 line 2\n";

    string file0Data = file0Header ~ file0Body;
    string file1Data = file1Header ~ file1Body;
    string file2Data = file2Header ~ file2Body;
    string file3Data = file3Header ~ file3Body;

    {
        file0.File("w").write(file0Data);
        file1.File("w").write(file1Data);
        file2.File("w").write(file2Data);
        file3.File("w").write(file3Data);
    }

    auto inputFiles = [file0, file1, file2, file3];
    auto fileHeaders = [file0Header, file1Header, file2Header, file3Header];
    auto fileBodies = [file0Body, file1Body, file2Body, file3Body];
    auto fileData = [file0Data, file1Data, file2Data, file3Data];

    auto buffer = new char[1024];  // Must be large enough to hold the test files.
2605 2606 /* Tests without standard input. Don't want to count on state of standard 2607 * input or modifying it when doing unit tests, so avoid reading from it. 2608 */ 2609 2610 auto readSourcesNoTerminator = appender!(ByLineSource!(No.keepTerminator)[]); 2611 auto readSourcesYesTerminator = appender!(ByLineSource!(Yes.keepTerminator)[]); 2612 2613 foreach(numFiles; 1 .. inputFiles.length + 1) 2614 { 2615 /* Using No.keepTerminator. */ 2616 readSourcesNoTerminator.clear; 2617 auto inputSourcesNoTerminator = byLineSourceRange!(No.keepTerminator)(inputFiles[0 .. numFiles]); 2618 assert(inputSourcesNoTerminator.length == numFiles); 2619 2620 foreach(fileNum, source; inputSourcesNoTerminator.enumerate) 2621 { 2622 readSourcesNoTerminator.put(source); 2623 assert(source.isOpen); 2624 assert(source._file.isOpen); 2625 assert(readSourcesNoTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 2626 assert(readSourcesNoTerminator.data[fileNum].isOpen); 2627 2628 auto headerNoTerminatorLength = fileHeaders[fileNum].length; 2629 if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 2630 2631 assert(source.byLine.empty || 2632 source.byLine.front == fileHeaders[fileNum][0 .. headerNoTerminatorLength]); 2633 2634 assert(source.name == inputFiles[fileNum]); 2635 assert(!source.isStdin); 2636 2637 auto readFileData = appender!(char[]); 2638 foreach(line; source.byLine) 2639 { 2640 readFileData.put(line); 2641 readFileData.put('\n'); 2642 } 2643 2644 assert(readFileData.data == fileData[fileNum]); 2645 } 2646 2647 /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 2648 assert(inputSourcesNoTerminator.empty); 2649 2650 /* Using Yes.keepTerminator. */ 2651 readSourcesYesTerminator.clear; 2652 auto inputSourcesYesTerminator = byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. 
numFiles]); 2653 assert(inputSourcesYesTerminator.length == numFiles); 2654 2655 foreach(fileNum, source; inputSourcesYesTerminator.enumerate) 2656 { 2657 readSourcesYesTerminator.put(source); 2658 assert(source.isOpen); 2659 assert(source._file.isOpen); 2660 assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 2661 assert(readSourcesYesTerminator.data[fileNum].isOpen); 2662 2663 assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]); 2664 2665 assert(source.name == inputFiles[fileNum]); 2666 assert(!source.isStdin); 2667 2668 auto readFileData = appender!(char[]); 2669 foreach(line; source.byLine) 2670 { 2671 readFileData.put(line); 2672 } 2673 2674 assert(readFileData.data == fileData[fileNum]); 2675 } 2676 2677 /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 2678 assert(inputSourcesYesTerminator.empty); 2679 } 2680 2681 /* Empty filelist. */ 2682 string[] nofiles; 2683 { 2684 auto sources = byLineSourceRange!(No.keepTerminator)(nofiles); 2685 assert(sources.empty); 2686 } 2687 { 2688 auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles); 2689 assert(sources.empty); 2690 } 2691 2692 /* Error cases. */ 2693 assertThrown(byLineSourceRange!(No.keepTerminator)([file0, "no_such_file.txt"]).each); 2694 assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", file1]).each); 2695 }