/**
Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange,
and several others.

Utilities in this file:
$(LIST
    * [InputFieldReordering] - A class that creates a reordered subset of fields from
      an input line. Fields in the subset are accessed by array indices. This is
      especially useful when processing the subset in a specific order, such as the
      order listed on the command-line at run-time.

    * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer
      output. Intended for use with stdout, it is a significant performance benefit.

    * [isFlushableOutputRange] - Tests if something is an OutputRange with a flush
      member.

    * [bufferedByLine] - An input range that reads from a File handle line by line.
      It is similar to the standard library method std.stdio.File.byLine, but quite a
      bit faster. This is achieved by reading in larger blocks and buffering.

    * [InputSourceRange] - An input range that provides open file access to a set of
      files. It is used to iterate over files passed as command line arguments. This
      enables reading the header line of a file during command line argument
      processing, then passing the open file to the main processing functions.

    * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides
      access to a byLine iterator (bufferedByLine) rather than an open file. This is
      used by tools that run the same processing logic on both header and non-header
      lines.

    * [joinAppend] - A function that performs a join, but appending the join output to
      an output stream. It is a performance improvement over using join or joiner with
      writeln.

    * [getTsvFieldValue] - A convenience function when only a single value is needed
      from an input line.

    * [throwIfWindowsNewlineOnUnix] - A utility for Unix platform builds to detect
      Windows newlines in input.
)

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.common.utils;

import std.range;
import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned, ReturnType;
import std.typecons : Flag, No, Yes;

// InputFieldReordering class.

/** Flag used by the InputFieldReordering template. */
alias EnablePartialLines = Flag!"enablePartialLines";

/**
InputFieldReordering - Move select fields from an input line to an output array,
reordering along the way.

The InputFieldReordering class is used to reorder a subset of fields from an input line.
The caller instantiates an InputFieldReordering object at the start of input processing.
The instance contains a mapping from input index to output index, plus a buffer holding
the reordered fields. The caller processes each input line by calling initNewLine,
splitting the line into fields, and calling processNextField on each field. The output
buffer is ready when the allFieldsFilled method returns true.

Fields are not copied, instead the output buffer points to the fields passed by the caller.
The caller needs to use or copy the output buffer while the fields are still valid, which
is normally until reading the next input line. The program below illustrates the basic use
case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also joinAppend,
below, which has a performance improvement over join used here.)

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            writeln(fieldReordering.outputFields.join('\t'));
        }
        else
        {
            writeln("Error: Insufficient number of field on the line.");
        }
    }
    return 0;
}
---

Field indices are zero-based. An individual field can be listed multiple times. The
outputFields array is not valid until all the specified fields have been processed. The
allFieldsFilled method tests this. If a line does not have enough fields the outputFields
buffer cannot be used. For most TSV applications this is okay, as it means the line is
invalid and cannot be used. However, if partial lines are okay, the template can be
instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in
are empty strings in the outputFields return.
*/
final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no)
if (isSomeChar!C)
{
    /* Implementation: The class works by creating an array of tuples mapping the input
     * field index to the location in the outputFields array. The 'fromToMap' array is
     * sorted in input field order, enabling placement in the outputFields buffer during a
     * pass over the input fields. The map is created by the constructor. An example:
     *
     *    inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9]
     *             fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>]
     *
     * During processing of a line, an array slice, mapStack, is used to track how
     * much of the fromToMap remains to be processed.
     */
    import std.range;
    import std.typecons : Tuple;

    alias TupleFromTo = Tuple!(size_t, "from", size_t, "to");

    private C[][] outputFieldsBuf;
    private TupleFromTo[] fromToMap;
    private TupleFromTo[] mapStack;

    /** Builds the input-index to output-slot map.
     *
     * Params:
     *   inputFieldIndicies = Zero-based input field indices, listed in output order.
     *   start = Position of the first output slot. Output slots are numbered from
     *           `start` to `start + inputFieldIndicies.length - 1`. Slots below `start`
     *           are never assigned by processNextField.
     */
    final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe
    {
        import std.algorithm : sort;

        /* Output slot numbers begin at 'start', so the buffer has to span the offset
         * as well. Sizing it by the index count alone makes every non-zero 'start'
         * index past the end of the buffer in processNextField.
         */
        outputFieldsBuf = new C[][](start + inputFieldIndicies.length);
        fromToMap.reserve(inputFieldIndicies.length);

        foreach (to, from; inputFieldIndicies.enumerate(start))
        {
            fromToMap ~= TupleFromTo(from, to);
        }

        /* Sorting by (from, to) puts the map in input field order. */
        sort(fromToMap);
        initNewLine;
    }

    /** initNewLine initializes the object for a new line. */
    final void initNewLine() pure nothrow @safe
    {
        mapStack = fromToMap;
        static if (partialLinesOk)
        {
            /* Partial lines: clear every slot so unfilled fields read as empty. */
            import std.algorithm : each;
            outputFieldsBuf.each!((ref s) => s.length = 0);
        }
    }

    /** processNextField maps an input field to the correct locations in the
     *  outputFields array.
     *
     *  processNextField should be called once for each field on the line, in the order
     *  found. The processing of the line can terminate once allFieldsFilled returns
     *  true.
     *
     *  The return value is the number of output fields the input field maps to. Zero
     *  means the field is not mapped to the output fields array.
     *
     *  If, prior to allFieldsFilled returning true, any fields on the input line
     *  are not passed to processNextField, the caller should either ensure the fields
     *  are not part of the output fields or have partial lines enabled.
     */
    final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc
    {
        size_t numFilled = 0;

        /* The map is sorted by input index, so all entries for this field are at the
         * front of the remaining map.
         */
        while (!mapStack.empty && fieldIndex == mapStack.front.from)
        {
            outputFieldsBuf[mapStack.front.to] = fieldValue;
            mapStack.popFront;
            numFilled++;
        }
        return numFilled;
    }

    /** allFieldsFilled returns true if all fields expected have been processed. */
    final bool allFieldsFilled() const pure nothrow @safe @nogc
    {
        return mapStack.empty;
    }

    /** outputFields is the assembled output fields. Unless partial lines are enabled,
     *  it is only valid after allFieldsFilled is true.
     */
    final C[][] outputFields() pure nothrow @safe @nogc
    {
        return outputFieldsBuf[];
    }
}

// InputFieldReordering - Tests using different character types.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    auto expected_2_0 = [["r1f2",   "r1f0"],
                         ["ÀBCßßZ", "r2f0"],
                         ["456",    "r3f0"]];

    char[][][]  charExpected_2_0 = to!(char[][][])(expected_2_0);
    wchar[][][] wcharExpected_2_0 = to!(wchar[][][])(expected_2_0);
    dchar[][][] dcharExpected_2_0 = to!(dchar[][][])(expected_2_0);
    dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0);

    auto charIFR  = new InputFieldReordering!char(fields_2_0);
    auto wcharIFR = new InputFieldReordering!wchar(fields_2_0);
    auto dcharIFR = new InputFieldReordering!dchar(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        wcharIFR.initNewLine;
        dcharIFR.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            wcharIFR.processNextField(fieldIndex, to!(wchar[])(fieldValue));
            dcharIFR.processNextField(fieldIndex, to!(dchar[])(fieldValue));

            assert ((fieldIndex >= 2) == charIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == wcharIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == dcharIFR.allFieldsFilled);
        }
        assert(charIFR.allFieldsFilled);
        assert(wcharIFR.allFieldsFilled);
        assert(dcharIFR.allFieldsFilled);

        assert(charIFR.outputFields == charExpected_2_0[lineIndex]);
        assert(wcharIFR.outputFields == wcharExpected_2_0[lineIndex]);
        assert(dcharIFR.outputFields == dcharExpected_2_0[lineIndex]);
    }
}

// InputFieldReordering - Test of partial line support.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    // The expected states of the output field while each line and field are processed.
    auto expectedBylineByfield_2_0 =
        [
            [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"],   ["r1f2", "r1f0"]],
            [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]],
            [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"],    ["456", "r3f0"]],
        ];

    char[][][][] charExpectedBylineByfield_2_0 = to!(char[][][][])(expectedBylineByfield_2_0);

    auto charIFR = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            assert(charIFR.outputFields == charExpectedBylineByfield_2_0[lineIndex][fieldIndex]);
        }
    }
}

// InputFieldReordering - Test of the output offset ('start') constructor argument.
@safe unittest
{
    import std.conv : to;

    size_t[] fields_2_0 = [2, 0];
    auto ifr = new InputFieldReordering!char(fields_2_0, 1);

    ifr.initNewLine;
    foreach (fieldIndex, fieldValue; ["f0", "f1", "f2"])
    {
        ifr.processNextField(fieldIndex, to!(char[])(fieldValue));
    }

    assert(ifr.allFieldsFilled);
    assert(ifr.outputFields.length == 3);
    assert(ifr.outputFields[0].length == 0);
    assert(ifr.outputFields[1] == "f2");
    assert(ifr.outputFields[2] == "f0");
}

// InputFieldReordering - Field combination tests.
@safe unittest
{
    import std.conv : to;
    import std.stdio;

    auto inputLines = [["00", "01", "02", "03"],
                       ["10", "11", "12", "13"],
                       ["20", "21", "22", "23"]];

    size_t[] fields_0 = [0];
    size_t[] fields_3 = [3];
    size_t[] fields_01 = [0, 1];
    size_t[] fields_10 = [1, 0];
    size_t[] fields_03 = [0, 3];
    size_t[] fields_30 = [3, 0];
    size_t[] fields_0123 = [0, 1, 2, 3];
    size_t[] fields_3210 = [3, 2, 1, 0];
    size_t[] fields_03001 = [0, 3, 0, 0, 1];

    auto expected_0 = to!(char[][][])([["00"],
                                       ["10"],
                                       ["20"]]);

    auto expected_3 = to!(char[][][])([["03"],
                                       ["13"],
                                       ["23"]]);

    auto expected_01 = to!(char[][][])([["00", "01"],
                                        ["10", "11"],
                                        ["20", "21"]]);

    auto expected_10 = to!(char[][][])([["01", "00"],
                                        ["11", "10"],
                                        ["21", "20"]]);

    auto expected_03 = to!(char[][][])([["00", "03"],
                                        ["10", "13"],
                                        ["20", "23"]]);

    auto expected_30 = to!(char[][][])([["03", "00"],
                                        ["13", "10"],
                                        ["23", "20"]]);

    auto expected_0123 = to!(char[][][])([["00", "01", "02", "03"],
                                          ["10", "11", "12", "13"],
                                          ["20", "21", "22", "23"]]);

    auto expected_3210 = to!(char[][][])([["03", "02", "01", "00"],
                                          ["13", "12", "11", "10"],
                                          ["23", "22", "21", "20"]]);

    auto expected_03001 = to!(char[][][])([["00", "03", "00", "00", "01"],
                                           ["10", "13", "10", "10", "11"],
                                           ["20", "23", "20", "20", "21"]]);

    auto ifr_0 = new InputFieldReordering!char(fields_0);
    auto ifr_3 = new InputFieldReordering!char(fields_3);
    auto ifr_01 = new InputFieldReordering!char(fields_01);
    auto ifr_10 = new InputFieldReordering!char(fields_10);
    auto ifr_03 = new InputFieldReordering!char(fields_03);
    auto ifr_30 = new InputFieldReordering!char(fields_30);
    auto ifr_0123 = new InputFieldReordering!char(fields_0123);
    auto ifr_3210 = new InputFieldReordering!char(fields_3210);
    auto ifr_03001 = new InputFieldReordering!char(fields_03001);

    foreach (lineIndex, line; inputLines)
    {
        ifr_0.initNewLine;
        ifr_3.initNewLine;
        ifr_01.initNewLine;
        ifr_10.initNewLine;
        ifr_03.initNewLine;
        ifr_30.initNewLine;
        ifr_0123.initNewLine;
        ifr_3210.initNewLine;
        ifr_03001.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            ifr_0.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_3.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_01.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_10.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_03.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_30.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_0123.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_3210.processNextField(fieldIndex, to!(char[])(fieldValue));
            ifr_03001.processNextField(fieldIndex, to!(char[])(fieldValue));
        }

        assert(ifr_0.outputFields == expected_0[lineIndex]);
        assert(ifr_3.outputFields == expected_3[lineIndex]);
        assert(ifr_01.outputFields == expected_01[lineIndex]);
        assert(ifr_10.outputFields == expected_10[lineIndex]);
        assert(ifr_03.outputFields == expected_03[lineIndex]);
        assert(ifr_30.outputFields == expected_30[lineIndex]);
        assert(ifr_0123.outputFields == expected_0123[lineIndex]);
        assert(ifr_3210.outputFields == expected_3210[lineIndex]);
        assert(ifr_03001.outputFields == expected_03001[lineIndex]);
    }
}


import std.stdio : File, isFileHandle, KeepTerminator;
import std.range : isOutputRange;
import std.traits : Unqual;

/**
BufferedOutputRange is a performance enhancement over writing directly to an output
stream. It holds a File open for write or an OutputRange. Output is accumulated in an
internal buffer and written to the output stream as a block.

Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster
than writing to stdout directly. This is especially noticeable for outputs with short
lines, as it blocks many writes together in a single write.

The internal buffer is written to the output stream after flushSize has been reached.
This is checked at newline boundaries, when appendln is called or when put is called
with a single newline character. Other writes check maxSize, which is used to avoid
runaway buffers.


BufferedOutputRange has a put method allowing it to be used as a range. It has a number
of other methods providing additional control.

$(LIST
    * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor. Takes the
      output stream, e.g. stdout. Other arguments are optional, defaults normally suffice.

    * `append(stuff)` - Append to the internal buffer.

    * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer
      is flushed to the output stream if it has reached flushSize.

    * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the
      output stream if it has reached flushSize.

    * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`.
      For reasons that are not clear, joiner is quite slow.

    * `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been
      reached.

    * `flush()` - Write the internal buffer to the output stream.

    * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single
      newline character, '\n' or "\n".
)

The internal buffer is automatically flushed when the BufferedOutputRange goes out of
scope.
*/
struct BufferedOutputRange(OutputTarget)
if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char))
{
    import std.array : appender;
    import std.range : ElementType, isInputRange, isOutputRange;
    import std.traits : isSomeChar, isSomeString;

    /* Identify the output element type. Only supporting char and ubyte for now. */
    static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char))
    {
        alias C = char;
    }
    else static if (isOutputRange!(OutputTarget, ubyte))
    {
        alias C = ubyte;
    }
    else static assert(false);

    private enum defaultReserveSize = 11264;
    private enum defaultFlushSize = 10240;
    private enum defaultMaxSize = 4194304;

    private OutputTarget _outputTarget;
    private auto _outputBuffer = appender!(C[]);
    private immutable size_t _flushSize;
    private immutable size_t _maxSize;

    /** Creates a buffered wrapper around outputTarget.
     *
     * Params:
     *   outputTarget = The File or output range receiving the buffered data.
     *   flushSize = Buffer length at which a flush occurs at a newline boundary.
     *   reserveSize = Capacity reserved up front in the internal buffer.
     *   maxSize = Buffer length at which a flush occurs regardless of newlines. Must
     *             not be less than flushSize.
     */
    this(OutputTarget outputTarget,
         size_t flushSize = defaultFlushSize,
         size_t reserveSize = defaultReserveSize,
         size_t maxSize = defaultMaxSize)
    {
        assert(flushSize <= maxSize);

        _outputTarget = outputTarget;
        _flushSize = flushSize;

        /* Keeps maxSize >= flushSize when the assert above is compiled out. */
        _maxSize = (flushSize <= maxSize) ? maxSize : flushSize;
        _outputBuffer.reserve(reserveSize);
    }

    /** Flushes any buffered data when the object goes out of scope. */
    ~this()
    {
        flush();
    }

    /** Writes the internal buffer to the output target and empties the buffer. */
    void flush()
    {
        static if (isFileHandle!OutputTarget) _outputTarget.write(_outputBuffer.data);
        else _outputTarget.put(_outputBuffer.data);

        _outputBuffer.clear;
    }

    /** Flushes if the buffer has reached flushSize. Returns true if a flush occurred. */
    bool flushIfFull()
    {
        bool isFull = _outputBuffer.data.length >= _flushSize;
        if (isFull) flush();
        return isFull;
    }

    /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */
    void flushIfMaxSize()
    {
        if (_outputBuffer.data.length >= _maxSize) flush();
    }

    /* maybeFlush is intended for the case where put is called with a trailing newline.
     *
     * Flushing occurs if the buffer has a trailing newline and has reached flush size.
     * Flushing also occurs if the buffer has reached max size.
     *
     * Returns true if a flush occurred.
     */
    private bool maybeFlush()
    {
        immutable size_t bufferedLength = _outputBuffer.data.length;

        /* An empty buffer has nothing to write and no last character to inspect.
         * Without this guard a flushSize of zero leads to indexing an empty array,
         * e.g. when an empty string is appended to an empty buffer.
         */
        if (bufferedLength == 0) return false;

        immutable bool doFlush =
            bufferedLength >= _flushSize &&
            (_outputBuffer.data[bufferedLength - 1] == '\n' || bufferedLength >= _maxSize);

        if (doFlush) flush();
        return doFlush;
    }


    /* Appends to the internal buffer without any flush checks. */
    private void appendRaw(T)(T stuff) pure
    {
        import std.range : rangePut = put;
        rangePut(_outputBuffer, stuff);
    }

    /** Appends to the internal buffer. Flushes per the maybeFlush rules. */
    void append(T)(T stuff)
    {
        appendRaw(stuff);
        maybeFlush();
    }

    /** Appends a newline, then flushes if flushSize has been reached.
     *  Returns true if a flush occurred.
     */
    bool appendln()
    {
        appendRaw('\n');
        return flushIfFull();
    }

    /** Appends stuff followed by a newline, then flushes if flushSize has been
     *  reached. Returns true if a flush occurred.
     */
    bool appendln(T)(T stuff)
    {
        appendRaw(stuff);
        return appendln();
    }

    /* joinAppend is an optimization of append(inputRange.joiner(delimiter)).
     * This form is quite a bit faster, 40%+ on some benchmarks.
     */
    void joinAppend(InputRange, E)(InputRange inputRange, E delimiter)
    if (isInputRange!InputRange &&
        is(ElementType!InputRange : const C[]) &&
        (is(E : const C[]) || is(E : const C)))
    {
        /* The first element is written without a leading delimiter. */
        if (!inputRange.empty)
        {
            appendRaw(inputRange.front);
            inputRange.popFront;
        }
        foreach (x; inputRange)
        {
            appendRaw(delimiter);
            appendRaw(x);
        }
        flushIfMaxSize();
    }

    /* Make this an output range. */
    void put(T)(T stuff)
    {
        static if (isSomeChar!T)
        {
            if (stuff == '\n') appendln();
            else appendRaw(stuff);
        }
        else static if (isSomeString!T)
        {
            if (stuff == "\n") appendln();
            else append(stuff);
        }
        else append(stuff);
    }
}

// BufferedOutputRange.
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_output");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : map, joiner;
    import std.range : iota;
    import std.conv : to;

    /* Basic test. Note that exiting the scope triggers flush. */
    string filepath1 = buildPath(testDir, "file1.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath1.File("w"));
        ostream.append("file1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln(100.to!string);
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* Test with no reserve and a flush at every line (flushSize is zero). */
    string filepath2 = buildPath(testDir, "file2.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath2.File("w"), 0, 0);
        ostream.append("file2: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* With a locking text writer. Requires version 2.078.0
       See: https://issues.dlang.org/show_bug.cgi?id=9661
     */
    static if (__VERSION__ >= 2078)
    {
        string filepath3 = buildPath(testDir, "file3.txt");
        {
            import std.stdio : File;

            auto ltw = filepath3.File("w").lockingTextWriter;
            {
                auto ostream = BufferedOutputRange!(typeof(ltw))(ltw);
                ostream.append("file3: ");
                ostream.append("abc");
                ostream.append(["def", "ghi", "jkl"]);
                ostream.appendln("100");
                ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
                ostream.appendln();
            }
        }
        assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
    }

    /* With an Appender. */
    import std.array : appender;
    auto app1 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1))(app1);
        ostream.append("appender1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* With an Appender, but checking flush boundaries. */
    auto app2 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+
        bool wasFlushed = false;

        assert(app2.data == "");

        ostream.append("12345678"); // Not flushed yet.
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Ninth char, not flushed yet.
        assert(!wasFlushed);
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Tenth char, now flushed.
        assert(wasFlushed);
        assert(app2.data == "12345678\n\n");

        app2.clear;
        assert(app2.data == "");

        ostream.append("12345678");

        wasFlushed = ostream.flushIfFull;  // Eight chars, below the flush size of 10.
        assert(!wasFlushed);
        assert(app2.data == "");

        ostream.flush;  // An explicit flush writes regardless of size.
        assert(app2.data == "12345678");

        app2.clear;
        assert(app2.data == "");

        /* Past flush size, but no trailing newline and below the default max size. */
        ostream.append("123456789012345");
        assert(app2.data == "");
    }
    assert(app2.data == "123456789012345");

    /* Using joinAppend. */
    auto app1b = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1b))(app1b);
        ostream.append("appenderB: ");
        ostream.joinAppend(["a", "bc", "def"], '-');
        ostream.append(':');
        ostream.joinAppend(["g", "hi", "jkl"], '-');
        ostream.appendln("*100*");
        ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' ');
        ostream.append(' ');
        ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " ");
        ostream.appendln();
    }
    assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n",
           "app1b.data: |" ~app1b.data ~ "|");

    /* Operating as an output range. When passed to a function as a ref, exiting
     * the function does not flush. When passed as a value, it gets flushed when
     * the function returns. Also test both UFCS and non-UFCS styles.
     */

    void outputStuffAsRef(T)(ref T range)
    if (isOutputRange!(T, char))
    {
        range.put('1');
        put(range, "23");
        range.put('\n');
        range.put(["5", "67"]);
        put(range, iota(8, 10).map!(x => x.to!string));
        put(range, "\n");
    }

    void outputStuffAsVal(T)(T range)
    if (isOutputRange!(T, char))
    {
        put(range, '1');
        range.put("23");
        put(range, '\n');
        put(range, ["5", "67"]);
        range.put(iota(8, 10).map!(x => x.to!string));
        range.put("\n");
    }

    auto app3 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0);
        outputStuffAsRef(ostream);
        assert(app3.data == "", "app3.data: |" ~app3.data ~ "|");
        outputStuffAsRef(ostream);
        assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|");
    }
    assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|");

    auto app4 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0);
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|");
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");
    }
    assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");

    /* Test maxSize. */
    auto app5 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10
        assert(app5.data == "");

        ostream.append("1234567");  // Not flushed yet (no newline).
        assert(app5.data == "");

        ostream.append("89012");    // Flushed by maxSize
        assert(app5.data == "123456789012");

        ostream.put("1234567");     // Not flushed yet (no newline).
        assert(app5.data == "123456789012");

        ostream.put("89012");       // Flushed by maxSize
        assert(app5.data == "123456789012123456789012");

        ostream.joinAppend(["ab", "cd"], '-');        // Not flushed yet
        ostream.joinAppend(["de", "gh", "ij"], '-');  // Flushed by maxSize
        assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
    }
    assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
}

/**
isFlushableOutputRange returns true if R is an output range with a flush member.

R must be an output range for elements of type E (char by default), and calling
`flush` on a value of type R must be valid and return void.
*/
enum bool isFlushableOutputRange(R, E=char) = isOutputRange!(R, E)
    && is(ReturnType!((R r) => r.flush) == void);

@safe unittest
{
    import std.array;
    auto app = appender!(char[]);
    auto ostream = BufferedOutputRange!(typeof(app))(app, 5, 0, 10); // maxSize 10

    /* An Appender is an output range, but has no flush member. */
    static assert(isOutputRange!(typeof(app), char));
    static assert(!isFlushableOutputRange!(typeof(app), char));
    static assert(!isFlushableOutputRange!(typeof(app)));

    /* A BufferedOutputRange is both an output range and flushable. */
    static assert(isOutputRange!(typeof(ostream), char));
    static assert(isFlushableOutputRange!(typeof(ostream), char));
    static assert(isFlushableOutputRange!(typeof(ostream)));

    static assert(isOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string));

    static assert(isOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[])));

    static assert(isOutputRange!(BufferedOutputRange!(Appender!(char[])), char));
    static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[]))));
    static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])), char));
}


/**
bufferedByLine is a performance enhancement over std.stdio.File.byLine. It works by
reading a large buffer from the input stream rather than just a single line.

The file argument needs to be a File object open for reading, typically a filesystem
file or standard input. Use the Yes.keepTerminator template parameter to keep the
newline. This is similar to stdio.File.byLine, except specified as a template parameter
rather than a runtime parameter.

Reading in blocks does mean that input is not read until a full buffer is available or
end-of-file is reached. For this reason, bufferedByLine is not appropriate for
interactive input.

The remaining template parameters are the element type of the returned lines (char or
ubyte), the line terminator byte, the size of each block read from the file (readSize),
and the increment used when growing the buffer (growSize, which must be greater than
zero and no larger than readSize).

The slice returned by front refers to the internal buffer. popFront may move data
within that buffer or reallocate it, so a line must be used or copied before the next
popFront call.
*/

auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char,
                    ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16)
    (File file)
if (is(Char == char) || is(Char == ubyte))
{
    static assert(0 < growSize && growSize <= readSize);

    static final class BufferedByLineImpl
    {
        /* Buffer state variables
         *   - _buffer.length - Full length of allocated buffer.
         *   - _dataEnd - End of currently valid data (end of last read).
         *   - _lineStart - Start of current line.
         *   - _lineEnd - End of current line.
         */
        private File _file;
        private ubyte[] _buffer;
        private size_t _lineStart = 0;
        private size_t _lineEnd = 0;
        private size_t _dataEnd = 0;

        this (File f)
        {
            _file = f;
            _buffer = new ubyte[readSize + growSize];
        }

        /* Empty once the file is at end-of-file and all buffered data is consumed. */
        bool empty() const pure
        {
            return _file.eof && _lineStart == _dataEnd;
        }

        /* Returns the current line as a slice of the internal buffer. */
        Char[] front() pure
        {
            assert(!empty, "Attempt to take the front of an empty bufferedByLine.");

            static if (keepTerminator == Yes.keepTerminator)
            {
                return cast(Char[]) _buffer[_lineStart .. _lineEnd];
            }
            else
            {
                /* Drop the terminator if present. The last line of a file may not
                 * have one.
                 */
                assert(_lineStart < _lineEnd);
                immutable end = (_buffer[_lineEnd - 1] == terminator) ? _lineEnd - 1 : _lineEnd;
                return cast(Char[]) _buffer[_lineStart .. end];
            }
        }

        /* Note: Call popFront at initialization to do the initial read. */
        void popFront()
        {
            import std.algorithm: copy, find;
            assert(!empty, "Attempt to popFront an empty bufferedByLine.");

            /* Pop the current line. */
            _lineStart = _lineEnd;

            /* Set up the next line if more data is available, either in the buffer or
             * the file. The next line ends at the next newline, if there is one.
             *
             * Notes:
             * - 'find' returns the slice starting with the character searched for, or
             *   an empty range if not found.
             * - _lineEnd is set to _dataEnd both when the current buffer does not have
             *   a newline and when it ends with one.
             */
            auto found = _buffer[_lineStart .. _dataEnd].find(terminator);
            _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

            if (found.empty && !_file.eof)
            {
                /* No newline in current buffer. Read from the file until the next
                 * newline is found.
                 */
                assert(_lineEnd == _dataEnd);

                if (_lineStart > 0)
                {
                    /* Move remaining data to the start of the buffer. */
                    immutable remainingLength = _dataEnd - _lineStart;
                    copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. remainingLength]);
                    _lineStart = 0;
                    _lineEnd = _dataEnd = remainingLength;
                }

                do
                {
                    /* Grow the buffer if necessary. Growth is in growSize increments
                     * until a full readSize block fits after the existing data.
                     */
                    immutable availableSize = _buffer.length - _dataEnd;
                    if (availableSize < readSize)
                    {
                        size_t growBy = growSize;
                        while (availableSize + growBy < readSize) growBy += growSize;
                        _buffer.length += growBy;
                    }

                    /* Read the next block. */
                    _dataEnd +=
                        _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize])
                        .length;

                    /* Only the newly read data needs searching. _lineEnd still marks
                     * the end of the data that was already searched.
                     */
                    found = _buffer[_lineEnd .. _dataEnd].find(terminator);
                    _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

                } while (found.empty && !_file.eof);
            }
        }
    }

    assert(file.isOpen, "bufferedByLine passed a closed file.");

    /* The initial popFront does the first read and sets up the first line. */
    auto r = new BufferedByLineImpl(file);
    if (!r.empty) r.popFront;
    return r;
}

// BufferedByLine.
unittest
{
    import std.array : appender;
    import std.conv : to;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;
    import std.range : lockstep;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline");
    scope(exit) testDir.rmdirRecurse;

    /* Create two data files with the same data. Read both in parallel with byLine and
     * bufferedByLine and compare each line.
     */
    auto data1 = appender!(char[])();

    foreach (i; 1 .. 1001) data1.put('\n');
    foreach (i; 1 .. 1001) data1.put("a\n");
    foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); }
    foreach (i; 1 .. 1001)
    {
        foreach (j; 1 .. i+1) data1.put('x');
        data1.put('\n');
    }

    string file1a = buildPath(testDir, "file1a.txt");
    string file1b = buildPath(testDir, "file1b.txt");
    {

        file1a.File("w").write(data1.data);
        file1b.File("w").write(data1.data);
    }

    /* Default parameters. */
    {
        auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator);
        auto f1bIn = file1b.File().byLine(No.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }
    {
        auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator);
        auto f1bIn = file1b.File().byLine(Yes.keepTerminator);
        foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b);
    }

    /* Smaller read size. This will trigger buffer growth.
*/ 1009 { 1010 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', 512, 256); 1011 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1012 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1013 } 1014 1015 /* Exercise boundary cases in buffer growth. 1016 * Note: static-foreach requires DMD 2.076 / LDC 1.6 1017 */ 1018 static foreach (readSize; [1, 2, 4]) 1019 { 1020 static foreach (growSize; 1 .. readSize + 1) 1021 {{ 1022 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1023 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1024 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1025 }} 1026 static foreach (growSize; 1 .. readSize + 1) 1027 {{ 1028 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1029 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1030 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1031 }} 1032 } 1033 1034 1035 /* Files that do not end in a newline. */ 1036 1037 string file2a = buildPath(testDir, "file2a.txt"); 1038 string file2b = buildPath(testDir, "file2b.txt"); 1039 string file3a = buildPath(testDir, "file3a.txt"); 1040 string file3b = buildPath(testDir, "file3b.txt"); 1041 string file4a = buildPath(testDir, "file4a.txt"); 1042 string file4b = buildPath(testDir, "file4b.txt"); 1043 { 1044 file1a.File("w").write("a"); 1045 file1b.File("w").write("a"); 1046 file2a.File("w").write("ab"); 1047 file2b.File("w").write("ab"); 1048 file3a.File("w").write("abc"); 1049 file3b.File("w").write("abc"); 1050 } 1051 1052 static foreach (readSize; [1, 2, 4]) 1053 { 1054 static foreach (growSize; 1 .. 
readSize + 1) 1055 {{ 1056 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1057 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1058 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1059 1060 auto f2aIn = file2a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1061 auto f2bIn = file2b.File().byLine(No.keepTerminator); 1062 foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1063 1064 auto f3aIn = file3a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1065 auto f3bIn = file3b.File().byLine(No.keepTerminator); 1066 foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1067 }} 1068 static foreach (growSize; 1 .. readSize + 1) 1069 {{ 1070 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1071 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1072 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1073 1074 auto f2aIn = file2a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1075 auto f2bIn = file2b.File().byLine(Yes.keepTerminator); 1076 foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1077 1078 auto f3aIn = file3a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1079 auto f3bIn = file3b.File().byLine(Yes.keepTerminator); 1080 foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1081 }} 1082 } 1083 } 1084 1085 /** 1086 joinAppend performs a join operation on an input range, appending the results to 1087 an output range. 1088 1089 joinAppend was written as a performance enhancement over using std.algorithm.joiner 1090 or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower 1091 than std.array.join with writeln. 
The joiner performance may be due to interaction
with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter
is better, but still substantially slower than join. Using join works reasonably well,
but is allocating memory unnecessarily.

Using joinAppend with Appender is a bit faster than join, and allocates less memory.
The Appender re-uses the underlying data buffer, saving memory. The example below
illustrates. It is a modification of the InputFieldReordering example. The role
Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange
uses a similar technique to buffer multiple lines.

Note: The original uses joinAppend have been replaced by BufferedOutputRange, which has
its own joinAppend method. However, joinAppend remains useful when constructing internal
buffers where BufferedOutputRange is not appropriate.

Params:
    inputRange = The elements to join. Either a range of E (single elements) or a
        range of E[] (element arrays). It is passed by value, so the caller's range
        is not consumed when it is a value type such as an array.
    outputRange = The output range the joined result is appended to.
    delimiter = The value written between consecutive elements of inputRange.

Returns: The output range, after the append.

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    auto outputBuffer = appender!(char[]);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            outputBuffer.clear;
            writeln(fieldReordering.outputFields.joinAppend(outputBuffer, '\t'));
        }
        else
        {
            writeln("Error: Insufficient number of field on the line.");
        }
    }
    return 0;
}
---
*/
OutputRange joinAppend(InputRange, OutputRange, E)
    (InputRange inputRange, ref OutputRange outputRange, E delimiter)
/* The two element-type alternatives are grouped explicitly. '&&' binds tighter than
 * '||', so without the outer parentheses the isInputRange test would only guard the
 * first alternative.
 */
if (isInputRange!InputRange &&
    ((is(ElementType!InputRange : const E[]) &&
      isOutputRange!(OutputRange, E[]))
     ||
     (is(ElementType!InputRange : const E) &&
      isOutputRange!(OutputRange, E)))
    )
{
    /* The first element is written without a leading delimiter. */
    if (!inputRange.empty)
    {
        outputRange.put(inputRange.front);
        inputRange.popFront;
    }
    foreach (x; inputRange)
    {
        outputRange.put(delimiter);
        outputRange.put(x);
    }
    return outputRange;
}

// joinAppend.
@safe unittest
{
    import std.array : appender;
    import std.algorithm : equal;

    char[] c1 = ['a', 'b', 'c'];
    char[] c2 = ['d', 'e', 'f'];
    char[] c3 = ['g', 'h', 'i'];
    auto cvec = [c1, c2, c3];

    auto s1 = "abc";
    auto s2 = "def";
    auto s3 = "ghi";
    auto svec = [s1, s2, s3];

    auto charAppender = appender!(char[])();

    /* The input arrays must be left intact by the join. */
    assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi");
    assert(equal(cvec, [c1, c2, c3]));

    charAppender.put('$');
    assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi");
    assert(equal(svec, [s1, s2, s3]));

    charAppender.clear;
    assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi");

    auto intAppender = appender!(int[])();

    auto i1 = [100, 101, 102];
    auto i2 = [200, 201, 202];
    auto i3 = [300, 301, 302];
    auto ivec = [i1, i2, i3];

    assert(ivec.joinAppend(intAppender, 0).data ==
           [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]);

    intAppender.clear;
    assert(i1.joinAppend(intAppender, 0).data ==
           [100, 0, 101, 0, 102]);
    assert(i2.joinAppend(intAppender, 1).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202]);
    assert(i3.joinAppend(intAppender, 2).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202,
            300, 2, 301, 2, 302]);
}

/**
getTsvFieldValue extracts the value of a single field from a delimited text string.
This is a convenience function for callers that need just one field from an input
line. When several values are needed it is more efficient to work directly with
std.algorithm.splitter or the InputFieldReordering class.

The input text is split on the delimiter character. The requested field is converted
to the desired type and returned.

An exception is thrown when the line has too few fields or when the conversion fails.
Conversion uses std.conv.to, which throws a std.conv.ConvException on failure. For
the too-few-fields case the exception text uses 1-upped field numbers, matching what
command line users provide.
*/
T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim)
if (isSomeChar!C)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.format : format;
    import std.range;

    auto fields = line.splitter(delim);

    /* Step over the fields ahead of the requested one, counting how many exist. */
    size_t numSkipped = 0;
    for (; numSkipped < fieldIndex && !fields.empty; ++numSkipped) fields.popFront;

    if (!fields.empty) return fields.front.to!T;

    if (fieldIndex != 0)
    {
        throw new Exception(
            format("Not enough fields on line. Number required: %d; Number found: %d",
                   fieldIndex + 1, numSkipped));
    }

    /* Splitter special case: an empty input produces an empty range rather than a
     * single empty field, which does not represent a single column file well. The
     * (empty) input line itself stands in for the missing empty field. Info:
     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
     */
    assert(line.empty);
    return line.to!T;
}

// getTsvFieldValue.
@safe unittest
{
    import std.conv : ConvException, to;
    import std.exception;

    /* Common cases. */
    assert(getTsvFieldValue!double("123", 0, '\t') == 123.0);
    assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5);
    assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123);
    assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99);
    assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99");
    assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红");
    assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅");
    assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑");
    assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5);

    /* The empty field cases. */
    assert(getTsvFieldValue!string("", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 0, '\t') == "");
    assert(getTsvFieldValue!string("\t", 1, '\t') == "");
    assert(getTsvFieldValue!string("", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 0, ':') == "");
    assert(getTsvFieldValue!string(":", 1, ':') == "");

    /* Tests with different data types. The same line is held in four character
     * array types; each expectation is checked against all four.
     */
    string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5";
    char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]);
    dchar[] dcharLine = stringLine.to!(dchar[]);
    wchar[] wcharLine = stringLine.to!(wchar[]);

    /* First field, as a string. */
    assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black");
    assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black");

    /* Second field, as a string. */
    assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი");
    assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი");

    /* Second field, as a wstring. */
    assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);
    assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring);

    /* Third field, as a double. */
    assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5);
    assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5);
    assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5);
    assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5);

    /* Conversion errors. */
    assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t'));
    assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|'));
    assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|'));

    /* Not enough field errors. These should throw, but not a ConvException.*/
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t')));
}

/** [Yes|No.newlineWasRemoved] is a template parameter to throwIfWindowsNewlineOnUnix.
 *  A Yes value indicates the Unix newline was already removed, as might be done via
 *  std.File.byLine or similar mechanism.
 */
alias NewlineWasRemoved = Flag!"newlineWasRemoved";

/**
throwIfWindowsNewlineOnUnix throws an exception if the line ends with a Windows/DOS
line ending and the build was compiled for a Posix platform. The TSV Utilities use
it to detect Windows/DOS line endings and terminate processing with an error message
to the user. On non-Posix builds the function does nothing.

The filename and lineNum arguments are used only in the error message. A filename
of "-" is reported as "Standard Input".
*/
void throwIfWindowsNewlineOnUnix
    (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved)
    (const char[] line, const char[] filename, size_t lineNum)
{
    version(Posix)
    {
        /* The trailing characters that identify a Windows/DOS line ending depend on
         * whether the Unix newline is still attached to the line.
         */
        static if (nlWasRemoved)
        {
            enum string windowsEnding = "\r";
        }
        else
        {
            enum string windowsEnding = "\r\n";
        }

        if (line.length >= windowsEnding.length &&
            line[$ - windowsEnding.length .. $] == windowsEnding)
        {
            import std.format;
            throw new Exception(
                format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n File: %s, Line: %s",
                       (filename == "-") ? "Standard Input" : filename, lineNum));
        }
    }
}

// throwIfWindowsNewlineOnUnix
@safe unittest
{
    /* Note: Currently only building on Posix. Need to add non-Posix test cases
     * if Windows builds are ever done.
     */
    version(Posix)
    {
        import std.exception;

        /* Newline already removed (the default). */
        foreach (i, text; ["", "a", "ab", "abc"])
        {
            assertNotThrown(throwIfWindowsNewlineOnUnix(text, "afile.tsv", i + 1));
        }
        foreach (i, text; ["\r", "a\r", "ab\r", "abc\r"])
        {
            assertThrown(throwIfWindowsNewlineOnUnix(text, "afile.tsv", i + 1));
        }

        /* Newline still attached. */
        foreach (i, text; ["\n", "a\n", "ab\n", "abc\n"])
        {
            assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)(text, "afile.tsv", i + 1));
        }
        foreach (i, text; ["\r\n", "a\r\n", "ab\r\n", "abc\r\n"])
        {
            assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)(text, "afile.tsv", i + 5));
        }

        /* Standard Input formatting. A null message means nothing was thrown, which
         * fails the endsWith check.
         */
        import std.algorithm : endsWith;

        string messageNlRemoved;
        try throwIfWindowsNewlineOnUnix("\r", "-", 99);
        catch (Exception e) messageNlRemoved = e.msg;
        assert(messageNlRemoved.endsWith("File: Standard Input, Line: 99"));

        string messageNlKept;
        try throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "-", 99);
        catch (Exception e) messageNlKept = e.msg;
        assert(messageNlKept.endsWith("File: Standard Input, Line: 99"));
    }
}

/** Flag used by InputSourceRange to determine if the header line should be read when
opening a file.
*/
alias ReadHeader = Flag!"readHeader";

/**
inputSourceRange is a helper function for creating new InputSourceRange objects.
*/
InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader)
{
    return new InputSourceRange(filepaths, readHeader);
}

/**
InputSourceRange is an input range that iterates over a set of input files.

InputSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The caller can choose to
have header lines read automatically.

The range is created from a set of filepaths. These filepaths are mapped to
InputSource objects during the iteration. This is what enables automatically opening
and closing files and reading the header line.

The motivation for an InputSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file and the header line along to the main processing functions.
This enables features like named fields to be implemented in a standard way.

Both InputSourceRange and InputSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new InputSourceRange against the same filepaths.

Currently, InputSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
*/
final class InputSourceRange
{
    import std.range;

    private string[] _filepaths;      // Paths not yet opened.
    private ReadHeader _readHeader;   // Passed to every InputSource created.
    private InputSource _front;       // Currently open source; null when exhausted.

    /** Copies the filepath list and opens the first file, if there is one. */
    this(string[] filepaths, ReadHeader readHeader)
    {
        _filepaths = filepaths.dup;
        _readHeader = readHeader;
        advanceToNextSource();
    }

    /** Number of sources remaining, including the current front element. */
    size_t length() const pure nothrow @safe
    {
        if (empty) return 0;
        return 1 + _filepaths.length;
    }

    /** True once every source has been popped, or if no filepaths were given. */
    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    /** The currently open InputSource. */
    InputSource front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty InputSourceRange");
        return _front;
    }

    /** Closes the current source and opens the next one, if any. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty InputSourceRange");

        _front.close;
        advanceToNextSource();
    }

    /* Makes the next unopened filepath the front element, opening it. Sets the
     * front element to null when no filepaths remain.
     */
    private void advanceToNextSource()
    {
        if (_filepaths.empty)
        {
            _front = null;
            return;
        }

        _front = new InputSource(_filepaths.front, _readHeader);
        _front.open;
        _filepaths.popFront;
    }
}

/**
InputSource is a class of objects produced by iterating over an InputSourceRange.

An InputSource object provides access to the open file currently the front element
of an InputSourceRange. The main methods application code is likely to need are:
The main methods application code is likely to need are: 1535 1536 $(LIST 1537 * `file()` - Returns the File object. The file will be open for reading as long 1538 InputSource instance is the front element of the InputSourceRange it came from. 1539 1540 * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the 1541 header line from the file. An empty string is returned if InputSource range 1542 was created with readHeader=false. 1543 1544 * `name()` - The name of the input source. The name returned is intended for 1545 user error messages. For files, this is the filepath that was passed to 1546 InputSourceRange. For standard input, it is "Standard Input". 1547 ) 1548 1549 An InputSource is a reference object, so the copies will retain the state of the 1550 InputSourceRange front element. In particular, all copies will have the open 1551 state of the front element of the InputSourceRange. 1552 1553 This class is not intended for use outside the context of an InputSourceRange. 1554 */ 1555 final class InputSource 1556 { 1557 import std.range; 1558 import std.stdio; 1559 1560 private immutable string _filepath; 1561 private immutable bool _isStdin; 1562 private bool _isOpen; 1563 private ReadHeader _readHeader; 1564 private bool _hasBeenOpened; 1565 private string _header; 1566 private File _file; 1567 1568 private this(string filepath, ReadHeader readHeader) pure nothrow @safe 1569 { 1570 _filepath = filepath; 1571 _isStdin = filepath == "-"; 1572 _isOpen = false; 1573 _readHeader = readHeader; 1574 _hasBeenOpened = false; 1575 } 1576 1577 /** file returns the File object held by the InputSource. 1578 * 1579 * The File will be open for reading as long as the InputSource instance is the 1580 * front element of the InputSourceRange it came from. 1581 */ 1582 File file() nothrow @safe 1583 { 1584 return _file; 1585 } 1586 1587 /** isReadHeaderEnabled returns true if the header line is being read. 
1588 */ 1589 bool isReadHeaderEnabled() const pure nothrow @safe 1590 { 1591 return _readHeader == Yes.readHeader; 1592 } 1593 1594 /** header returns the header line from the input file. 1595 * 1596 * An empty string is returned if InputSource range was created with 1597 * readHeader=false. 1598 */ 1599 string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe 1600 { 1601 assert(_hasBeenOpened); 1602 return (keepTerminator == Yes.keepTerminator || 1603 _header.length == 0 || 1604 _header[$ - 1] != '\n') ? 1605 _header : _header[0 .. $-1]; 1606 } 1607 1608 /** isHeaderEmpty returns true if there is no data for a header, including the 1609 * terminator. 1610 * 1611 * When headers are being read, this true only if the file is empty. 1612 */ 1613 bool isHeaderEmpty() const pure nothrow @safe 1614 { 1615 assert(_hasBeenOpened); 1616 return _header.empty; 1617 } 1618 1619 /** name returns a user friendly name representing the input source. 1620 * 1621 * For files, it is the filepath provided to InputSourceRange. For standard 1622 * input, it is "Standard Input". (Use isStdin() to test for standard input, 1623 * not name(). 1624 */ 1625 string name() const pure nothrow @safe 1626 { 1627 return _isStdin ? "Standard Input" : _filepath; 1628 } 1629 1630 /** isStdin returns true if the input source is Standard Input, false otherwise. 1631 */ 1632 bool isStdin() const pure nothrow @safe 1633 { 1634 return _isStdin; 1635 } 1636 1637 /** isOpen returns true if the input source is open for reading, false otherwise. 1638 * 1639 * "Open" in this context is whether the InputSource object is currently open, 1640 * meaning that it is the front element of the InputSourceRange that created it. 1641 * 1642 * For files, this is also reflected in the state of the underlying File object. 1643 * However, standard input is never actually closed. 
1644 */ 1645 bool isOpen() const pure nothrow @safe 1646 { 1647 return _isOpen; 1648 } 1649 1650 private void open() 1651 { 1652 assert(!_isOpen); 1653 assert(!_hasBeenOpened); 1654 1655 _file = isStdin ? stdin : _filepath.File("rb"); 1656 if (_readHeader) _header = _file.readln; 1657 _isOpen = true; 1658 _hasBeenOpened = true; 1659 } 1660 1661 private void close() 1662 { 1663 if (!_isStdin) _file.close; 1664 _isOpen = false; 1665 } 1666 } 1667 1668 // InputSourceRange and InputSource 1669 unittest 1670 { 1671 import std.algorithm : all, each; 1672 import std.array : appender; 1673 import std.exception : assertThrown; 1674 import std.file : rmdirRecurse; 1675 import std.path : buildPath; 1676 import std.range; 1677 import std.stdio; 1678 import tsv_utils.common.unittest_utils; 1679 1680 auto testDir = makeUnittestTempDir("tsv_utils_input_source_range"); 1681 scope(exit) testDir.rmdirRecurse; 1682 1683 string file0 = buildPath(testDir, "file0.txt"); 1684 string file1 = buildPath(testDir, "file1.txt"); 1685 string file2 = buildPath(testDir, "file2.txt"); 1686 string file3 = buildPath(testDir, "file3.txt"); 1687 1688 string file0Header = ""; 1689 string file1Header = "file 1 header\n"; 1690 string file2Header = "file 2 header\n"; 1691 string file3Header = "file 3 header\n"; 1692 1693 string file0Body = ""; 1694 string file1Body = ""; 1695 string file2Body = "file 2 line 1\n"; 1696 string file3Body = "file 3 line 1\nfile 3 line 2\n"; 1697 1698 string file0Data = file0Header ~ file0Body; 1699 string file1Data = file1Header ~ file1Body; 1700 string file2Data = file2Header ~ file2Body; 1701 string file3Data = file3Header ~ file3Body; 1702 1703 { 1704 file0.File("w").write(file0Data); 1705 file1.File("w").write(file1Data); 1706 file2.File("w").write(file2Data); 1707 file3.File("w").write(file3Data); 1708 } 1709 1710 auto inputFiles = [file0, file1, file2, file3]; 1711 auto fileHeaders = [file0Header, file1Header, file2Header, file3Header]; 1712 auto fileBodies = 
[file0Body, file1Body, file2Body, file3Body]; 1713 auto fileData = [file0Data, file1Data, file2Data, file3Data]; 1714 1715 auto readSources = appender!(InputSource[]); 1716 auto buffer = new char[1024]; // Must be large enough to hold the test files. 1717 1718 /* Tests without standard input. Don't want to count on state of standard 1719 * input or modifying it when doing unit tests, so avoid reading from it. 1720 */ 1721 1722 foreach(numFiles; 1 .. inputFiles.length + 1) 1723 { 1724 /* Reading headers. */ 1725 1726 readSources.clear; 1727 auto inputSourcesYesHeader = inputSourceRange(inputFiles[0 .. numFiles], Yes.readHeader); 1728 assert(inputSourcesYesHeader.length == numFiles); 1729 1730 foreach(fileNum, source; inputSourcesYesHeader.enumerate) 1731 { 1732 readSources.put(source); 1733 assert(source.isOpen); 1734 assert(source.file.isOpen); 1735 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 1736 assert(readSources.data[fileNum].isOpen); 1737 1738 assert(source.header(Yes.keepTerminator) == fileHeaders[fileNum]); 1739 1740 auto headerNoTerminatorLength = fileHeaders[fileNum].length; 1741 if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 1742 assert(source.header(No.keepTerminator) == 1743 fileHeaders[fileNum][0 .. headerNoTerminatorLength]); 1744 1745 assert(source.name == inputFiles[fileNum]); 1746 assert(!source.isStdin); 1747 assert(source.isReadHeaderEnabled); 1748 1749 assert(source.file.rawRead(buffer) == fileBodies[fileNum]); 1750 } 1751 1752 /* The InputSourceRange is a reference range, consumed by the foreach. */ 1753 assert(inputSourcesYesHeader.empty); 1754 1755 /* Without reading headers. */ 1756 1757 readSources.clear; 1758 auto inputSourcesNoHeader = inputSourceRange(inputFiles[0 .. 
numFiles], No.readHeader); 1759 assert(inputSourcesNoHeader.length == numFiles); 1760 1761 foreach(fileNum, source; inputSourcesNoHeader.enumerate) 1762 { 1763 readSources.put(source); 1764 assert(source.isOpen); 1765 assert(source.file.isOpen); 1766 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 1767 assert(readSources.data[fileNum].isOpen); 1768 1769 assert(source.header(Yes.keepTerminator).empty); 1770 assert(source.header(No.keepTerminator).empty); 1771 1772 assert(source.name == inputFiles[fileNum]); 1773 assert(!source.isStdin); 1774 assert(!source.isReadHeaderEnabled); 1775 1776 assert(source.file.rawRead(buffer) == fileData[fileNum]); 1777 } 1778 1779 /* The InputSourceRange is a reference range, consumed by the foreach. */ 1780 assert(inputSourcesNoHeader.empty); 1781 } 1782 1783 /* Tests with standard input. No actual reading in these tests. 1784 */ 1785 1786 readSources.clear; 1787 foreach(fileNum, source; inputSourceRange(["-", "-"], No.readHeader).enumerate) 1788 { 1789 readSources.put(source); 1790 assert(source.isOpen); 1791 assert(source.file.isOpen); 1792 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); // InputSource objects are "closed". 1793 assert(readSources.data[0 .. fileNum].all!(s => s.file.isOpen)); // Actual stdin should not be closed. 1794 assert(readSources.data[fileNum].isOpen); 1795 1796 assert(source.header(Yes.keepTerminator).empty); 1797 assert(source.header(No.keepTerminator).empty); 1798 1799 assert(source.name == "Standard Input"); 1800 assert(source.isStdin); 1801 } 1802 1803 /* Empty filelist. */ 1804 string[] nofiles; 1805 { 1806 auto sources = inputSourceRange(nofiles, No.readHeader); 1807 assert(sources.empty); 1808 } 1809 { 1810 auto sources = inputSourceRange(nofiles, Yes.readHeader); 1811 assert(sources.empty); 1812 } 1813 1814 /* Error cases. 
*/ 1815 assertThrown(inputSourceRange([file0, "no_such_file.txt"], No.readHeader).each); 1816 assertThrown(inputSourceRange(["no_such_file.txt", file1], Yes.readHeader).each); 1817 } 1818 1819 /** 1820 byLineSourceRange is a helper function for creating new byLineSourceRange objects. 1821 */ 1822 auto byLineSourceRange( 1823 KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n') 1824 (string[] filepaths) 1825 if (is(Char == char) || is(Char == ubyte)) 1826 { 1827 return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths); 1828 } 1829 1830 /** 1831 ByLineSourceRange is an input range that iterates over a set of input files. It 1832 provides bufferedByLine access to each file. 1833 1834 A ByLineSourceRange is used to iterate over a set of files passed on the command line. 1835 Files are automatically opened and closed during iteration. The front element of the 1836 range provides access to a bufferedByLine for iterating over the lines in the file. 1837 1838 The range is created from a set of filepaths. These filepaths are mapped to 1839 ByLineSource objects during the iteration. This is what enables automatically opening 1840 and closing files and providing bufferedByLine access. 1841 1842 The motivation behind ByLineSourceRange is to provide a standard way to look at the 1843 header line of the first input file during command line argument processing, and then 1844 pass the open input file along to the main processing functions. This enables 1845 features like named fields to be implemented in a standard way. 1846 1847 Access to the first line of the first file is available after creating the 1848 ByLineSourceRange instance. The first file is opened and a bufferedByLine created. 1849 The first line of the first file is via byLine.front (after checking !byLine.empty). 1850 1851 Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use 1852 limited to a single iteration over the set of files. 
The files can be iterated again
by creating a new ByLineSourceRange against the same filepaths.

Currently, ByLineSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.

The template parameters (keepTerminator, Char, terminator) are forwarded unchanged to
the ByLineSource objects the range produces.
*/
final class ByLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;

    /** The element type of this range.
     *
     * Char is forwarded to ByLineSource. It was previously hard-coded to `char`,
     * which silently ignored an instantiation with `Char == ubyte`.
     */
    alias ByLineSourceType = ByLineSource!(keepTerminator, Char, terminator);

    /* Paths that have not been opened yet. The path of the current front element
     * has already been removed from this list.
     */
    private string[] _filepaths;

    /* The currently open source, or null once the range is exhausted. */
    private ByLineSourceType _front;

    /** Creates the range from a list of filepaths and opens the first one, if any.
     *
     * The filepaths array is copied, so the caller's array is not modified by
     * iteration. Opening the first file happens here, so a failure to open it
     * surfaces from the constructor.
     */
    this(string[] filepaths)
    {
        _filepaths = filepaths.dup;
        _front = null;
        openNextSource();
    }

    /** Returns the number of sources remaining, including the current front. */
    size_t length() const pure nothrow @safe
    {
        return empty ? 0 : _filepaths.length + 1;
    }

    /** Returns true when all sources have been consumed (or none were given). */
    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    /** Returns the currently open ByLineSource. The range must not be empty. */
    ByLineSourceType front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty ByLineSourceRange");
        return _front;
    }

    /** Closes the current source and opens the next one, if any.
     *
     * The range must not be empty. After the last source is popped the range
     * becomes empty.
     */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty ByLineSourceRange");

        _front.close;
        openNextSource();
    }

    /* Makes the next pending filepath the front element and opens it, or marks the
     * range as exhausted when no filepaths remain.
     *
     * The statement order matches the original inline code: the new source is
     * assigned to _front before it is opened, and the path is removed from the
     * pending list only after the open succeeds.
     */
    private void openNextSource()
    {
        if (_filepaths.empty)
        {
            _front = null;
            return;
        }

        _front = new ByLineSourceType(_filepaths.front);
        _front.open;
        _filepaths.popFront;
    }
}

/**
ByLineSource is a class of objects produced by iterating over a ByLineSourceRange.

A ByLineSource instance provides a bufferedByLine range for the current front
element of a ByLineSourceRange. The main methods application code is likely to
need are:

$(LIST
    * `byLine()` - Returns the bufferedByLine range accessing the open file.
The file will be open for reading (using the bufferedByLine range) as long as the
      ByLineSource instance is the front element of the ByLineSourceRange
      it came from.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      ByLineSourceRange. For standard input, it is "Standard Input".
)

A ByLineSource is a reference object, so the copies have the same state as the
ByLineSourceRange front element. In particular, all copies will have the open
state of the front element of the ByLineSourceRange.

The template parameters (keepTerminator, Char, terminator) are forwarded unchanged
to bufferedByLine.

This class is not intended for use outside the context of a ByLineSourceRange.
*/
final class ByLineSource(
    KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.range;
    import std.stdio;
    import std.traits : ReturnType;

    /* The bufferedByLine instantiation used to read the source. Char is forwarded;
     * it was previously hard-coded to `char`, which silently ignored an
     * instantiation with `Char == ubyte`.
     */
    alias newByLineFn = bufferedByLine!(keepTerminator, Char, terminator);

    /** The type of the range returned by byLine(). */
    alias ByLineType = ReturnType!newByLineFn;

    private immutable string _filepath;   // Path as given by the caller ("-" for stdin).
    private immutable bool _isStdin;      // True when _filepath is "-".
    private bool _isOpen;                 // True between open() and close().
    private bool _hasBeenOpened;          // Guards against opening a source twice.
    private File _file;                   // Underlying file handle; valid after open().
    private ByLineType _byLineRange;      // Line range over _file; valid after open().

    /* Records the filepath without opening it. A filepath of "-" denotes
     * standard input.
     */
    private this(string filepath) pure nothrow @safe
    {
        _filepath = filepath;
        _isStdin = filepath == "-";
        _isOpen = false;
        _hasBeenOpened = false;
    }

    /** byLine returns the bufferedByLine object held by the ByLineSource instance.
     *
     * The File underlying the BufferedByLine object is open for reading as long as
     * the ByLineSource instance is the front element of the ByLineSourceRange it
     * came from.
     */
    ByLineType byLine() nothrow @safe
    {
        return _byLineRange;
    }

    /** name returns a user friendly name representing the underlying input source.
     *
     * For files, it is the filepath provided to ByLineSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input
     * rather than comparing against name().)
     */
    string name() const pure nothrow @safe
    {
        return _isStdin ? "Standard Input" : _filepath;
    }

    /** isStdin returns true if the underlying input source is Standard Input, false
     * otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the ByLineSource instance is open for reading, false
     * otherwise.
     *
     * "Open" in this context is whether the ByLineSource object is currently "open".
     * The underlying input source backing it does not necessarily have the same
     * state. The ByLineSource instance is "open" if it is the front element of the
     * ByLineSourceRange that created it.
     *
     * The underlying input source object follows the same open/close state where
     * that makes sense. In particular, real files are closed when the ByLineSource
     * object is closed. The exception is standard input, which is never actually
     * closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the underlying input source and creates the bufferedByLine range.
     *
     * A source may be opened only once; both asserts enforce that. Files are
     * opened in binary read mode ("rb"); standard input is used as-is.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        _file = isStdin ? stdin : _filepath.File("rb");
        _byLineRange = newByLineFn(_file);
        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the source closed. Real files are closed; standard input is left open. */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}

// ByLineSourceRange and ByLineSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    string file0 = buildPath(testDir, "file0.txt");
    string file1 = buildPath(testDir, "file1.txt");
    string file2 = buildPath(testDir, "file2.txt");
    string file3 = buildPath(testDir, "file3.txt");

    string file0Header = "";
    string file1Header = "file 1 header\n";
    string file2Header = "file 2 header\n";
    string file3Header = "file 3 header\n";

    string file0Body = "";
    string file1Body = "";
    string file2Body = "file 2 line 1\n";
    string file3Body = "file 3 line 1\nfile 3 line 2\n";

    string file0Data = file0Header ~ file0Body;
    string file1Data = file1Header ~ file1Body;
    string file2Data = file2Header ~ file2Body;
    string file3Data = file3Header ~ file3Body;

    {
        file0.File("w").write(file0Data);
        file1.File("w").write(file1Data);
        file2.File("w").write(file2Data);
        file3.File("w").write(file3Data);
    }

    auto inputFiles = [file0, file1, file2, file3];
    auto fileHeaders = [file0Header, file1Header, file2Header, file3Header];
    auto fileData = [file0Data, file1Data, file2Data, file3Data];

    /* Tests without standard input. Don't want to count on state of standard
     * input or modifying it when doing unit tests, so avoid reading from it.
     */

    auto readSourcesNoTerminator = appender!(ByLineSource!(No.keepTerminator)[]);
    auto readSourcesYesTerminator = appender!(ByLineSource!(Yes.keepTerminator)[]);

    foreach(numFiles; 1 .. inputFiles.length + 1)
    {
        /* Using No.keepTerminator. */
        readSourcesNoTerminator.clear;
        auto inputSourcesNoTerminator = byLineSourceRange!(No.keepTerminator)(inputFiles[0 .. numFiles]);
        assert(inputSourcesNoTerminator.length == numFiles);

        foreach(fileNum, source; inputSourcesNoTerminator.enumerate)
        {
            readSourcesNoTerminator.put(source);
            assert(source.isOpen);
            assert(source._file.isOpen);
            assert(readSourcesNoTerminator.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(readSourcesNoTerminator.data[fileNum].isOpen);

            /* Expected header without its trailing newline (if it has one). */
            auto headerNoTerminatorLength = fileHeaders[fileNum].length;
            if (headerNoTerminatorLength > 0) --headerNoTerminatorLength;

            assert(source.byLine.empty ||
                   source.byLine.front == fileHeaders[fileNum][0 .. headerNoTerminatorLength]);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);

            auto readFileData = appender!(char[]);
            foreach(line; source.byLine)
            {
                readFileData.put(line);
                readFileData.put('\n');
            }

            assert(readFileData.data == fileData[fileNum]);
        }

        /* The ByLineSourceRange is a reference range, consumed by the foreach. */
        assert(inputSourcesNoTerminator.empty);

        /* Using Yes.keepTerminator. */
        readSourcesYesTerminator.clear;
        auto inputSourcesYesTerminator = byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. numFiles]);
        assert(inputSourcesYesTerminator.length == numFiles);

        foreach(fileNum, source; inputSourcesYesTerminator.enumerate)
        {
            readSourcesYesTerminator.put(source);
            assert(source.isOpen);
            assert(source._file.isOpen);
            assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen));
            assert(readSourcesYesTerminator.data[fileNum].isOpen);

            assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]);

            assert(source.name == inputFiles[fileNum]);
            assert(!source.isStdin);

            auto readFileData = appender!(char[]);
            foreach(line; source.byLine)
            {
                readFileData.put(line);
            }

            assert(readFileData.data == fileData[fileNum]);
        }

        /* The ByLineSourceRange is a reference range, consumed by the foreach. */
        assert(inputSourcesYesTerminator.empty);
    }

    /* Empty filelist. */
    string[] nofiles;
    {
        auto sources = byLineSourceRange!(No.keepTerminator)(nofiles);
        assert(sources.empty);
    }
    {
        auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles);
        assert(sources.empty);
    }

    /* Error cases. */
    assertThrown(byLineSourceRange!(No.keepTerminator)([file0, "no_such_file.txt"]).each);
    assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", file1]).each);
}