1 /** 2 Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange, 3 and a several others. 4 5 Utilities in this file: 6 $(LIST 7 * [InputFieldReordering] - A class that creates a reordered subset of fields from 8 an input line. Fields in the subset are accessed by array indicies. This is 9 especially useful when processing the subset in a specific order, such as the 10 order listed on the command-line at run-time. 11 12 * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer 13 output. Intended for use with stdout, it is a significant performance benefit. 14 15 * [isFlushableOutputRange] - Tests if something is an OutputRange with a flush 16 member. 17 18 * [bufferedByLine] - An input range that reads from a File handle line by line. 19 It is similar to the standard library method std.stdio.File.byLine, but quite a 20 bit faster. This is achieved by reading in larger blocks and buffering. 21 22 * [InputSourceRange] - An input range that provides open file access to a set of 23 files. It is used to iterate over files passed as command line arguments. This 24 enable reading header line of a file during command line argument process, then 25 passing the open file to the main processing functions. 26 27 * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides 28 access to a byLine iterator (bufferedByLine) rather than an open file. This is 29 used by tools that run the same processing logic both header non-header lines. 30 31 * [joinAppend] - A function that performs a join, but appending the join output to 32 an output stream. It is a performance improvement over using join or joiner with 33 writeln. 34 35 * [getTsvFieldValue] - A convenience function when only a single value is needed 36 from an input line. 37 38 * [throwIfWindowsNewlineOnUnix] - A utility for Unix platform builds to detecting 39 Windows newlines in input. 40 ) 41 42 Copyright (c) 2015-2020, eBay Inc. 43 Initially written by Jon Degenhardt 44 45 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 46 */ 47 48 module tsv_utils.common.utils; 49 50 import std.range; 51 import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned, ReturnType; 52 import std.typecons : Flag, No, Yes; 53 54 // InputFieldReording class. 55 56 /** Flag used by the InputFieldReordering template. */ 57 alias EnablePartialLines = Flag!"enablePartialLines"; 58 59 /** 60 InputFieldReordering - Move select fields from an input line to an output array, 61 reordering along the way. 62 63 The InputFieldReordering class is used to reorder a subset of fields from an input line. 64 The caller instantiates an InputFieldReordering object at the start of input processing. 65 The instance contains a mapping from input index to output index, plus a buffer holding 66 the reordered fields. The caller processes each input line by calling initNewLine, 67 splitting the line into fields, and calling processNextField on each field. The output 68 buffer is ready when the allFieldsFilled method returns true. 69 70 Fields are not copied, instead the output buffer points to the fields passed by the caller. 71 The caller needs to use or copy the output buffer while the fields are still valid, which 72 is normally until reading the next input line. The program below illustrates the basic use 73 case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also joinAppend, 74 below, which has a performance improvement over join used here.) 75 76 --- 77 int main(string[] args) 78 { 79 import tsv_utils.common.utils; 80 import std.algorithm, std.array, std.range, std.stdio; 81 size_t[] fieldIndicies = [3, 0, 2]; 82 auto fieldReordering = new InputFieldReordering!char(fieldIndicies); 83 foreach (line; stdin.byLine) 84 { 85 fieldReordering.initNewLine; 86 foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate) 87 { 88 fieldReordering.processNextField(fieldIndex, fieldValue); 89 if (fieldReordering.allFieldsFilled) break; 90 } 91 if (fieldReordering.allFieldsFilled) 92 { 93 writeln(fieldReordering.outputFields.join('\t')); 94 } 95 else 96 { 97 writeln("Error: Insufficient number of field on the line."); 98 } 99 } 100 return 0; 101 } 102 --- 103 104 Field indicies are zero-based. An individual field can be listed multiple times. The 105 outputFields array is not valid until all the specified fields have been processed. The 106 allFieldsFilled method tests this. If a line does not have enough fields the outputFields 107 buffer cannot be used. For most TSV applications this is okay, as it means the line is 108 invalid and cannot be used. However, if partial lines are okay, the template can be 109 instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in 110 are empty strings in the outputFields return. 111 */ 112 final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no) 113 if (isSomeChar!C) 114 { 115 /* Implementation: The class works by creating an array of tuples mapping the input 116 * field index to the location in the outputFields array. The 'fromToMap' array is 117 * sorted in input field order, enabling placement in the outputFields buffer during a 118 * pass over the input fields. The map is created by the constructor. An example: 119 * 120 * inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9] 121 * fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>] 122 * 123 * During processing of an a line, an array slice, mapStack, is used to track how 124 * much of the fromToMap remains to be processed. 125 */ 126 import std.range; 127 import std.typecons : Tuple; 128 129 alias TupleFromTo = Tuple!(size_t, "from", size_t, "to"); 130 131 private C[][] outputFieldsBuf; 132 private TupleFromTo[] fromToMap; 133 private TupleFromTo[] mapStack; 134 135 final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe 136 { 137 import std.algorithm : sort; 138 139 outputFieldsBuf = new C[][](inputFieldIndicies.length); 140 fromToMap.reserve(inputFieldIndicies.length); 141 142 foreach (to, from; inputFieldIndicies.enumerate(start)) 143 { 144 fromToMap ~= TupleFromTo(from, to); 145 } 146 147 sort(fromToMap); 148 initNewLine; 149 } 150 151 /** initNewLine initializes the object for a new line. */ 152 final void initNewLine() pure nothrow @safe 153 { 154 mapStack = fromToMap; 155 static if (partialLinesOk) 156 { 157 import std.algorithm : each; 158 outputFieldsBuf.each!((ref s) => s.length = 0); 159 } 160 } 161 162 /** processNextField maps an input field to the correct locations in the 163 * outputFields array. 164 * 165 * processNextField should be called once for each field on the line, in the order 166 * found. The processing of the line can terminate once allFieldsFilled returns 167 * true. 168 * 169 * The return value is the number of output fields the input field maps to. Zero 170 * means the field is not mapped to the output fields array. 171 * 172 * If, prior to allFieldsProcessed returning true, any fields on the input line 173 * are not passed to processNextField, the caller should either ensure the fields 174 * are not part of the output fields or have partial lines enabled. 175 */ 176 final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc 177 { 178 size_t numFilled = 0; 179 while (!mapStack.empty && fieldIndex == mapStack.front.from) 180 { 181 outputFieldsBuf[mapStack.front.to] = fieldValue; 182 mapStack.popFront; 183 numFilled++; 184 } 185 return numFilled; 186 } 187 188 /** allFieldsFilled returned true if all fields expected have been processed. */ 189 final bool allFieldsFilled() const pure nothrow @safe @nogc 190 { 191 return mapStack.empty; 192 } 193 194 /** outputFields is the assembled output fields. Unless partial lines are enabled, 195 * it is only valid after allFieldsFilled is true. 196 */ 197 final C[][] outputFields() pure nothrow @safe @nogc 198 { 199 return outputFieldsBuf[]; 200 } 201 } 202 203 // InputFieldReordering - Tests using different character types. 204 @safe unittest 205 { 206 import std.conv : to; 207 208 auto inputLines = [["r1f0", "r1f1", "r1f2", "r1f3"], 209 ["r2f0", "abc", "ÀBCßßZ", "ghi"], 210 ["r3f0", "123", "456", "789"]]; 211 212 size_t[] fields_2_0 = [2, 0]; 213 214 auto expected_2_0 = [["r1f2", "r1f0"], 215 ["ÀBCßßZ", "r2f0"], 216 ["456", "r3f0"]]; 217 218 char[][][] charExpected_2_0 = to!(char[][][])(expected_2_0); 219 wchar[][][] wcharExpected_2_0 = to!(wchar[][][])(expected_2_0); 220 dchar[][][] dcharExpected_2_0 = to!(dchar[][][])(expected_2_0); 221 dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0); 222 223 auto charIFR = new InputFieldReordering!char(fields_2_0); 224 auto wcharIFR = new InputFieldReordering!wchar(fields_2_0); 225 auto dcharIFR = new InputFieldReordering!dchar(fields_2_0); 226 227 foreach (lineIndex, line; inputLines) 228 { 229 charIFR.initNewLine; 230 wcharIFR.initNewLine; 231 dcharIFR.initNewLine; 232 233 foreach (fieldIndex, fieldValue; line) 234 { 235 charIFR.processNextField(fieldIndex, to!(char[])(fieldValue)); 236 wcharIFR.processNextField(fieldIndex, to!(wchar[])(fieldValue)); 237 dcharIFR.processNextField(fieldIndex, to!(dchar[])(fieldValue)); 238 239 assert ((fieldIndex >= 2) == charIFR.allFieldsFilled); 240 assert ((fieldIndex >= 2) == wcharIFR.allFieldsFilled); 241 assert ((fieldIndex >= 2) == dcharIFR.allFieldsFilled); 242 } 243 assert(charIFR.allFieldsFilled); 244 assert(wcharIFR.allFieldsFilled); 245 assert(dcharIFR.allFieldsFilled); 246 247 assert(charIFR.outputFields == charExpected_2_0[lineIndex]); 248 assert(wcharIFR.outputFields == wcharExpected_2_0[lineIndex]); 249 assert(dcharIFR.outputFields == dcharExpected_2_0[lineIndex]); 250 } 251 } 252 253 // InputFieldReordering - Test of partial line support. 254 @safe unittest 255 { 256 import std.conv : to; 257 258 auto inputLines = [["r1f0", "r1f1", "r1f2", "r1f3"], 259 ["r2f0", "abc", "ÀBCßßZ", "ghi"], 260 ["r3f0", "123", "456", "789"]]; 261 262 size_t[] fields_2_0 = [2, 0]; 263 264 // The expected states of the output field while each line and field are processed. 265 auto expectedBylineByfield_2_0 = 266 [ 267 [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"], ["r1f2", "r1f0"]], 268 [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]], 269 [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"], ["456", "r3f0"]], 270 ]; 271 272 char[][][][] charExpectedBylineByfield_2_0 = to!(char[][][][])(expectedBylineByfield_2_0); 273 274 auto charIFR = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0); 275 276 foreach (lineIndex, line; inputLines) 277 { 278 charIFR.initNewLine; 279 foreach (fieldIndex, fieldValue; line) 280 { 281 charIFR.processNextField(fieldIndex, to!(char[])(fieldValue)); 282 assert(charIFR.outputFields == charExpectedBylineByfield_2_0[lineIndex][fieldIndex]); 283 } 284 } 285 } 286 287 // InputFieldReordering - Field combination tests. 288 @safe unittest 289 { 290 import std.conv : to; 291 import std.stdio; 292 293 auto inputLines = [["00", "01", "02", "03"], 294 ["10", "11", "12", "13"], 295 ["20", "21", "22", "23"]]; 296 297 size_t[] fields_0 = [0]; 298 size_t[] fields_3 = [3]; 299 size_t[] fields_01 = [0, 1]; 300 size_t[] fields_10 = [1, 0]; 301 size_t[] fields_03 = [0, 3]; 302 size_t[] fields_30 = [3, 0]; 303 size_t[] fields_0123 = [0, 1, 2, 3]; 304 size_t[] fields_3210 = [3, 2, 1, 0]; 305 size_t[] fields_03001 = [0, 3, 0, 0, 1]; 306 307 auto expected_0 = to!(char[][][])([["00"], 308 ["10"], 309 ["20"]]); 310 311 auto expected_3 = to!(char[][][])([["03"], 312 ["13"], 313 ["23"]]); 314 315 auto expected_01 = to!(char[][][])([["00", "01"], 316 ["10", "11"], 317 ["20", "21"]]); 318 319 auto expected_10 = to!(char[][][])([["01", "00"], 320 ["11", "10"], 321 ["21", "20"]]); 322 323 auto expected_03 = to!(char[][][])([["00", "03"], 324 ["10", "13"], 325 ["20", "23"]]); 326 327 auto expected_30 = to!(char[][][])([["03", "00"], 328 ["13", "10"], 329 ["23", "20"]]); 330 331 auto expected_0123 = to!(char[][][])([["00", "01", "02", "03"], 332 ["10", "11", "12", "13"], 333 ["20", "21", "22", "23"]]); 334 335 auto expected_3210 = to!(char[][][])([["03", "02", "01", "00"], 336 ["13", "12", "11", "10"], 337 ["23", "22", "21", "20"]]); 338 339 auto expected_03001 = to!(char[][][])([["00", "03", "00", "00", "01"], 340 ["10", "13", "10", "10", "11"], 341 ["20", "23", "20", "20", "21"]]); 342 343 auto ifr_0 = new InputFieldReordering!char(fields_0); 344 auto ifr_3 = new InputFieldReordering!char(fields_3); 345 auto ifr_01 = new InputFieldReordering!char(fields_01); 346 auto ifr_10 = new InputFieldReordering!char(fields_10); 347 auto ifr_03 = new InputFieldReordering!char(fields_03); 348 auto ifr_30 = new InputFieldReordering!char(fields_30); 349 auto ifr_0123 = new InputFieldReordering!char(fields_0123); 350 auto ifr_3210 = new InputFieldReordering!char(fields_3210); 351 auto ifr_03001 = new InputFieldReordering!char(fields_03001); 352 353 foreach (lineIndex, line; inputLines) 354 { 355 ifr_0.initNewLine; 356 ifr_3.initNewLine; 357 ifr_01.initNewLine; 358 ifr_10.initNewLine; 359 ifr_03.initNewLine; 360 ifr_30.initNewLine; 361 ifr_0123.initNewLine; 362 ifr_3210.initNewLine; 363 ifr_03001.initNewLine; 364 365 foreach (fieldIndex, fieldValue; line) 366 { 367 ifr_0.processNextField(fieldIndex, to!(char[])(fieldValue)); 368 ifr_3.processNextField(fieldIndex, to!(char[])(fieldValue)); 369 ifr_01.processNextField(fieldIndex, to!(char[])(fieldValue)); 370 ifr_10.processNextField(fieldIndex, to!(char[])(fieldValue)); 371 ifr_03.processNextField(fieldIndex, to!(char[])(fieldValue)); 372 ifr_30.processNextField(fieldIndex, to!(char[])(fieldValue)); 373 ifr_0123.processNextField(fieldIndex, to!(char[])(fieldValue)); 374 ifr_3210.processNextField(fieldIndex, to!(char[])(fieldValue)); 375 ifr_03001.processNextField(fieldIndex, to!(char[])(fieldValue)); 376 } 377 378 assert(ifr_0.outputFields == expected_0[lineIndex]); 379 assert(ifr_3.outputFields == expected_3[lineIndex]); 380 assert(ifr_01.outputFields == expected_01[lineIndex]); 381 assert(ifr_10.outputFields == expected_10[lineIndex]); 382 assert(ifr_03.outputFields == expected_03[lineIndex]); 383 assert(ifr_30.outputFields == expected_30[lineIndex]); 384 assert(ifr_0123.outputFields == expected_0123[lineIndex]); 385 assert(ifr_3210.outputFields == expected_3210[lineIndex]); 386 assert(ifr_03001.outputFields == expected_03001[lineIndex]); 387 } 388 } 389 390 391 import std.stdio : File, isFileHandle, KeepTerminator; 392 import std.range : isOutputRange; 393 import std.traits : Unqual; 394 395 /** 396 BufferedOutputRange is a performance enhancement over writing directly to an output 397 stream. It holds a File open for write or an OutputRange. Ouput is accumulated in an 398 internal buffer and written to the output stream as a block. 399 400 Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster 401 than writing to stdout directly. This is especially noticable for outputs with short 402 lines, as it blocks many writes together in a single write. 403 404 The internal buffer is written to the output stream after flushSize has been reached. 405 This is checked at newline boundaries, when appendln is called or when put is called 406 with a single newline character. Other writes check maxSize, which is used to avoid 407 runaway buffers. 408 409 BufferedOutputRange has a put method allowing it to be used a range. It has a number 410 of other methods providing additional control. 411 412 $(LIST 413 * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor. Takes the 414 output stream, e.g. stdout. Other arguments are optional, defaults normally suffice. 415 416 * `append(stuff)` - Append to the internal buffer. 417 418 * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer 419 is flushed to the output stream if is has reached flushSize. 420 421 * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the 422 output stream if is has reached flushSize. 423 424 * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`. 425 For reasons that are not clear, joiner is quite slow. 426 427 * `flushIfFull()` - Flush the internal buffer to the output stream if flushSize has been 428 reached. 429 430 * `flush()` - Write the internal buffer to the output stream. 431 432 * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single 433 newline character, '\n' or "\n". 434 ) 435 436 The internal buffer is automatically flushed when the BufferedOutputRange goes out of 437 scope. 438 */ 439 struct BufferedOutputRange(OutputTarget) 440 if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char)) 441 { 442 import std.range : isOutputRange; 443 import std.array : appender; 444 import std.format : format; 445 446 /* Identify the output element type. Only supporting char and ubyte for now. */ 447 static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char)) 448 { 449 alias C = char; 450 } 451 else static if (isOutputRange!(OutputTarget, ubyte)) 452 { 453 alias C = ubyte; 454 } 455 else static assert(false); 456 457 private enum defaultReserveSize = 11264; 458 private enum defaultFlushSize = 10240; 459 private enum defaultMaxSize = 4194304; 460 461 private OutputTarget _outputTarget; 462 private auto _outputBuffer = appender!(C[]); 463 private immutable size_t _flushSize; 464 private immutable size_t _maxSize; 465 466 this(OutputTarget outputTarget, 467 size_t flushSize = defaultFlushSize, 468 size_t reserveSize = defaultReserveSize, 469 size_t maxSize = defaultMaxSize) 470 { 471 assert(flushSize <= maxSize); 472 473 _outputTarget = outputTarget; 474 _flushSize = flushSize; 475 _maxSize = (flushSize <= maxSize) ? maxSize : flushSize; 476 _outputBuffer.reserve(reserveSize); 477 } 478 479 ~this() 480 { 481 flush(); 482 } 483 484 void flush() 485 { 486 static if (isFileHandle!OutputTarget) _outputTarget.write(_outputBuffer.data); 487 else _outputTarget.put(_outputBuffer.data); 488 489 _outputBuffer.clear; 490 } 491 492 bool flushIfFull() 493 { 494 bool isFull = _outputBuffer.data.length >= _flushSize; 495 if (isFull) flush(); 496 return isFull; 497 } 498 499 /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */ 500 void flushIfMaxSize() 501 { 502 if (_outputBuffer.data.length >= _maxSize) flush(); 503 } 504 505 /* maybeFlush is intended for the case where put is called with a trailing newline. 506 * 507 * Flushing occurs if the buffer has a trailing newline and has reached flush size. 508 * Flushing also occurs if the buffer has reached max size. 509 */ 510 private bool maybeFlush() 511 { 512 immutable bool doFlush = 513 _outputBuffer.data.length >= _flushSize && 514 (_outputBuffer.data[$-1] == '\n' || _outputBuffer.data.length >= _maxSize); 515 516 if (doFlush) flush(); 517 return doFlush; 518 } 519 520 521 private void appendRaw(T)(T stuff) pure 522 { 523 import std.range : rangePut = put; 524 rangePut(_outputBuffer, stuff); 525 } 526 527 void append(T)(T stuff) 528 { 529 appendRaw(stuff); 530 maybeFlush(); 531 } 532 533 bool appendln() 534 { 535 appendRaw('\n'); 536 return flushIfFull(); 537 } 538 539 bool appendln(T)(T stuff) 540 { 541 appendRaw(stuff); 542 return appendln(); 543 } 544 545 /* joinAppend is an optimization of append(inputRange.joiner(delimiter). 546 * This form is quite a bit faster, 40%+ on some benchmarks. 547 */ 548 void joinAppend(InputRange, E)(InputRange inputRange, E delimiter) 549 if (isInputRange!InputRange && 550 is(ElementType!InputRange : const C[]) && 551 (is(E : const C[]) || is(E : const C))) 552 { 553 if (!inputRange.empty) 554 { 555 appendRaw(inputRange.front); 556 inputRange.popFront; 557 } 558 foreach (x; inputRange) 559 { 560 appendRaw(delimiter); 561 appendRaw(x); 562 } 563 flushIfMaxSize(); 564 } 565 566 /* Make this an output range. */ 567 void put(T)(T stuff) 568 { 569 import std.traits; 570 import std.stdio; 571 572 static if (isSomeChar!T) 573 { 574 if (stuff == '\n') appendln(); 575 else appendRaw(stuff); 576 } 577 else static if (isSomeString!T) 578 { 579 if (stuff == "\n") appendln(); 580 else append(stuff); 581 } 582 else append(stuff); 583 } 584 } 585 586 // BufferedOutputRange. 587 unittest 588 { 589 import tsv_utils.common.unittest_utils; 590 import std.file : rmdirRecurse, readText; 591 import std.path : buildPath; 592 593 auto testDir = makeUnittestTempDir("tsv_utils_buffered_output"); 594 scope(exit) testDir.rmdirRecurse; 595 596 import std.algorithm : map, joiner; 597 import std.range : iota; 598 import std.conv : to; 599 600 /* Basic test. Note that exiting the scope triggers flush. */ 601 string filepath1 = buildPath(testDir, "file1.txt"); 602 { 603 import std.stdio : File; 604 605 auto ostream = BufferedOutputRange!File(filepath1.File("w")); 606 ostream.append("file1: "); 607 ostream.append("abc"); 608 ostream.append(["def", "ghi", "jkl"]); 609 ostream.appendln(100.to!string); 610 ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 611 ostream.appendln(); 612 } 613 assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); 614 615 /* Test with no reserve and no flush at every line. */ 616 string filepath2 = buildPath(testDir, "file2.txt"); 617 { 618 import std.stdio : File; 619 620 auto ostream = BufferedOutputRange!File(filepath2.File("w"), 0, 0); 621 ostream.append("file2: "); 622 ostream.append("abc"); 623 ostream.append(["def", "ghi", "jkl"]); 624 ostream.appendln("100"); 625 ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 626 ostream.appendln(); 627 } 628 assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); 629 630 /* With a locking text writer. Requires version 2.078.0 631 See: https://issues.dlang.org/show_bug.cgi?id=9661 632 */ 633 static if (__VERSION__ >= 2078) 634 { 635 string filepath3 = buildPath(testDir, "file3.txt"); 636 { 637 import std.stdio : File; 638 639 auto ltw = filepath3.File("w").lockingTextWriter; 640 { 641 auto ostream = BufferedOutputRange!(typeof(ltw))(ltw); 642 ostream.append("file3: "); 643 ostream.append("abc"); 644 ostream.append(["def", "ghi", "jkl"]); 645 ostream.appendln("100"); 646 ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 647 ostream.appendln(); 648 } 649 } 650 assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); 651 } 652 653 /* With an Appender. */ 654 import std.array : appender; 655 auto app1 = appender!(char[]); 656 { 657 auto ostream = BufferedOutputRange!(typeof(app1))(app1); 658 ostream.append("appender1: "); 659 ostream.append("abc"); 660 ostream.append(["def", "ghi", "jkl"]); 661 ostream.appendln("100"); 662 ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" ")); 663 ostream.appendln(); 664 } 665 assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n"); 666 667 /* With an Appender, but checking flush boundaries. */ 668 auto app2 = appender!(char[]); 669 { 670 auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+ 671 bool wasFlushed = false; 672 673 assert(app2.data == ""); 674 675 ostream.append("12345678"); // Not flushed yet. 676 assert(app2.data == ""); 677 678 wasFlushed = ostream.appendln; // Nineth char, not flushed yet. 679 assert(!wasFlushed); 680 assert(app2.data == ""); 681 682 wasFlushed = ostream.appendln; // Tenth char, now flushed. 683 assert(wasFlushed); 684 assert(app2.data == "12345678\n\n"); 685 686 app2.clear; 687 assert(app2.data == ""); 688 689 ostream.append("12345678"); 690 691 wasFlushed = ostream.flushIfFull; 692 assert(!wasFlushed); 693 assert(app2.data == ""); 694 695 ostream.flush; 696 assert(app2.data == "12345678"); 697 698 app2.clear; 699 assert(app2.data == ""); 700 701 ostream.append("123456789012345"); 702 assert(app2.data == ""); 703 } 704 assert(app2.data == "123456789012345"); 705 706 /* Using joinAppend. */ 707 auto app1b = appender!(char[]); 708 { 709 auto ostream = BufferedOutputRange!(typeof(app1b))(app1b); 710 ostream.append("appenderB: "); 711 ostream.joinAppend(["a", "bc", "def"], '-'); 712 ostream.append(':'); 713 ostream.joinAppend(["g", "hi", "jkl"], '-'); 714 ostream.appendln("*100*"); 715 ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' '); 716 ostream.append(' '); 717 ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " "); 718 ostream.appendln(); 719 } 720 assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n", 721 "app1b.data: |" ~app1b.data ~ "|"); 722 723 /* Operating as an output range. When passed to a function as a ref, exiting 724 * the function does not flush. When passed as a value, it get flushed when 725 * the function returns. Also test both UCFS and non-UFCS styles. 726 */ 727 728 void outputStuffAsRef(T)(ref T range) 729 if (isOutputRange!(T, char)) 730 { 731 range.put('1'); 732 put(range, "23"); 733 range.put('\n'); 734 range.put(["5", "67"]); 735 put(range, iota(8, 10).map!(x => x.to!string)); 736 put(range, "\n"); 737 } 738 739 void outputStuffAsVal(T)(T range) 740 if (isOutputRange!(T, char)) 741 { 742 put(range, '1'); 743 range.put("23"); 744 put(range, '\n'); 745 put(range, ["5", "67"]); 746 range.put(iota(8, 10).map!(x => x.to!string)); 747 range.put("\n"); 748 } 749 750 auto app3 = appender!(char[]); 751 { 752 auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0); 753 outputStuffAsRef(ostream); 754 assert(app3.data == "", "app3.data: |" ~app3.data ~ "|"); 755 outputStuffAsRef(ostream); 756 assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|"); 757 } 758 assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|"); 759 760 auto app4 = appender!(char[]); 761 { 762 auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0); 763 outputStuffAsVal(ostream); 764 assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|"); 765 outputStuffAsVal(ostream); 766 assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|"); 767 } 768 assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|"); 769 770 /* Test maxSize. */ 771 auto app5 = appender!(char[]); 772 { 773 auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10 774 assert(app5.data == ""); 775 776 ostream.append("1234567"); // Not flushed yet (no newline). 777 assert(app5.data == ""); 778 779 ostream.append("89012"); // Flushed by maxSize 780 assert(app5.data == "123456789012"); 781 782 ostream.put("1234567"); // Not flushed yet (no newline). 783 assert(app5.data == "123456789012"); 784 785 ostream.put("89012"); // Flushed by maxSize 786 assert(app5.data == "123456789012123456789012"); 787 788 ostream.joinAppend(["ab", "cd"], '-'); // Not flushed yet 789 ostream.joinAppend(["de", "gh", "ij"], '-'); // Flushed by maxSize 790 assert(app5.data == "123456789012123456789012ab-cdde-gh-ij"); 791 } 792 assert(app5.data == "123456789012123456789012ab-cdde-gh-ij"); 793 } 794 795 /** 796 isFlushableOutputRange returns true if R is an output range with a flush member. 797 */ 798 enum bool isFlushableOutputRange(R, E=char) = isOutputRange!(R, E) 799 && is(ReturnType!((R r) => r.flush) == void); 800 801 @safe unittest 802 { 803 import std.array; 804 auto app = appender!(char[]); 805 auto ostream = BufferedOutputRange!(typeof(app))(app, 5, 0, 10); // maxSize 10 806 807 static assert(isOutputRange!(typeof(app), char)); 808 static assert(!isFlushableOutputRange!(typeof(app), char)); 809 static assert(!isFlushableOutputRange!(typeof(app))); 810 811 static assert(isOutputRange!(typeof(ostream), char)); 812 static assert(isFlushableOutputRange!(typeof(ostream), char)); 813 static assert(isFlushableOutputRange!(typeof(ostream))); 814 815 static assert(isOutputRange!(Appender!string, string)); 816 static assert(!isFlushableOutputRange!(Appender!string, string)); 817 static assert(!isFlushableOutputRange!(Appender!string)); 818 819 static assert(isOutputRange!(Appender!(char[]), char)); 820 static assert(!isFlushableOutputRange!(Appender!(char[]), char)); 821 static assert(!isFlushableOutputRange!(Appender!(char[]))); 822 823 static assert(isOutputRange!(BufferedOutputRange!(Appender!(char[])), char)); 824 static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])))); 825 static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])), char)); 826 } 827 828 829 /** 830 bufferedByLine is a performance enhancement over std.stdio.File.byLine. It works by 831 reading a large buffer from the input stream rather than just a single line. 832 833 The file argument needs to be a File object open for reading, typically a filesystem 834 file or standard input. Use the Yes.keepTerminator template parameter to keep the 835 newline. This is similar to stdio.File.byLine, except specified as a template paramter 836 rather than a runtime parameter. 837 838 Reading in blocks does mean that input is not read until a full buffer is available or 839 end-of-file is reached. For this reason, bufferedByLine is not appropriate for 840 interactive input. 841 */ 842 843 auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char, 844 ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16) 845 (File file) 846 if (is(Char == char) || is(Char == ubyte)) 847 { 848 static assert(0 < growSize && growSize <= readSize); 849 850 static final class BufferedByLineImpl 851 { 852 /* Buffer state variables 853 * - _buffer.length - Full length of allocated buffer. 854 * - _dataEnd - End of currently valid data (end of last read). 855 * - _lineStart - Start of current line. 856 * - _lineEnd - End of current line. 857 */ 858 private File _file; 859 private ubyte[] _buffer; 860 private size_t _lineStart = 0; 861 private size_t _lineEnd = 0; 862 private size_t _dataEnd = 0; 863 864 this (File f) 865 { 866 _file = f; 867 _buffer = new ubyte[readSize + growSize]; 868 } 869 870 bool empty() const pure 871 { 872 return _file.eof && _lineStart == _dataEnd; 873 } 874 875 Char[] front() pure 876 { 877 assert(!empty, "Attempt to take the front of an empty bufferedByLine."); 878 879 static if (keepTerminator == Yes.keepTerminator) 880 { 881 return cast(Char[]) _buffer[_lineStart .. _lineEnd]; 882 } 883 else 884 { 885 assert(_lineStart < _lineEnd); 886 immutable end = (_buffer[_lineEnd - 1] == terminator) ? _lineEnd - 1 : _lineEnd; 887 return cast(Char[]) _buffer[_lineStart .. end]; 888 } 889 } 890 891 /* Note: Call popFront at initialization to do the initial read. */ 892 void popFront() 893 { 894 import std.algorithm: copy, find; 895 assert(!empty, "Attempt to popFront an empty bufferedByLine."); 896 897 /* Pop the current line. */ 898 _lineStart = _lineEnd; 899 900 /* Set up the next line if more data is available, either in the buffer or 901 * the file. The next line ends at the next newline, if there is one. 902 * 903 * Notes: 904 * - 'find' returns the slice starting with the character searched for, or 905 * an empty range if not found. 906 * - _lineEnd is set to _dataEnd both when the current buffer does not have 907 * a newline and when it ends with one. 908 */ 909 auto found = _buffer[_lineStart .. _dataEnd].find(terminator); 910 _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1; 911 912 if (found.empty && !_file.eof) 913 { 914 /* No newline in current buffer. Read from the file until the next 915 * newline is found. 916 */ 917 assert(_lineEnd == _dataEnd); 918 919 if (_lineStart > 0) 920 { 921 /* Move remaining data to the start of the buffer. */ 922 immutable remainingLength = _dataEnd - _lineStart; 923 copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. remainingLength]); 924 _lineStart = 0; 925 _lineEnd = _dataEnd = remainingLength; 926 } 927 928 do 929 { 930 /* Grow the buffer if necessary. */ 931 immutable availableSize = _buffer.length - _dataEnd; 932 if (availableSize < readSize) 933 { 934 size_t growBy = growSize; 935 while (availableSize + growBy < readSize) growBy += growSize; 936 _buffer.length += growBy; 937 } 938 939 /* Read the next block. */ 940 _dataEnd += 941 _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize]) 942 .length; 943 944 found = _buffer[_lineEnd .. _dataEnd].find(terminator); 945 _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1; 946 947 } while (found.empty && !_file.eof); 948 } 949 } 950 } 951 952 assert(file.isOpen, "bufferedByLine passed a closed file."); 953 954 auto r = new BufferedByLineImpl(file); 955 if (!r.empty) r.popFront; 956 return r; 957 } 958 959 // BufferedByLine. 960 unittest 961 { 962 import std.array : appender; 963 import std.conv : to; 964 import std.file : rmdirRecurse, readText; 965 import std.path : buildPath; 966 import std.range : lockstep; 967 import std.stdio; 968 import tsv_utils.common.unittest_utils; 969 970 auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline"); 971 scope(exit) testDir.rmdirRecurse; 972 973 /* Create two data files with the same data. Read both in parallel with byLine and 974 * bufferedByLine and compare each line. 975 */ 976 auto data1 = appender!(char[])(); 977 978 foreach (i; 1 .. 1001) data1.put('\n'); 979 foreach (i; 1 .. 1001) data1.put("a\n"); 980 foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); } 981 foreach (i; 1 .. 1001) 982 { 983 foreach (j; 1 .. i+1) data1.put('x'); 984 data1.put('\n'); 985 } 986 987 string file1a = buildPath(testDir, "file1a.txt"); 988 string file1b = buildPath(testDir, "file1b.txt"); 989 { 990 991 file1a.File("w").write(data1.data); 992 file1b.File("w").write(data1.data); 993 } 994 995 /* Default parameters. */ 996 { 997 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator); 998 auto f1bIn = file1b.File().byLine(No.keepTerminator); 999 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1000 } 1001 { 1002 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator); 1003 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1004 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1005 } 1006 1007 /* Smaller read size. This will trigger buffer growth. */ 1008 { 1009 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', 512, 256); 1010 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1011 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1012 } 1013 1014 /* Exercise boundary cases in buffer growth. 1015 * Note: static-foreach requires DMD 2.076 / LDC 1.6 1016 */ 1017 static foreach (readSize; [1, 2, 4]) 1018 { 1019 static foreach (growSize; 1 .. readSize + 1) 1020 {{ 1021 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1022 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1023 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1024 }} 1025 static foreach (growSize; 1 .. readSize + 1) 1026 {{ 1027 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1028 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1029 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1030 }} 1031 } 1032 1033 1034 /* Files that do not end in a newline. */ 1035 1036 string file2a = buildPath(testDir, "file2a.txt"); 1037 string file2b = buildPath(testDir, "file2b.txt"); 1038 string file3a = buildPath(testDir, "file3a.txt"); 1039 string file3b = buildPath(testDir, "file3b.txt"); 1040 string file4a = buildPath(testDir, "file4a.txt"); 1041 string file4b = buildPath(testDir, "file4b.txt"); 1042 { 1043 file1a.File("w").write("a"); 1044 file1b.File("w").write("a"); 1045 file2a.File("w").write("ab"); 1046 file2b.File("w").write("ab"); 1047 file3a.File("w").write("abc"); 1048 file3b.File("w").write("abc"); 1049 } 1050 1051 static foreach (readSize; [1, 2, 4]) 1052 { 1053 static foreach (growSize; 1 .. readSize + 1) 1054 {{ 1055 auto f1aIn = file1a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1056 auto f1bIn = file1b.File().byLine(No.keepTerminator); 1057 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1058 1059 auto f2aIn = file2a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1060 auto f2bIn = file2b.File().byLine(No.keepTerminator); 1061 foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1062 1063 auto f3aIn = file3a.File().bufferedByLine!(No.keepTerminator, char, '\n', readSize, growSize); 1064 auto f3bIn = file3b.File().byLine(No.keepTerminator); 1065 foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1066 }} 1067 static foreach (growSize; 1 .. readSize + 1) 1068 {{ 1069 auto f1aIn = file1a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1070 auto f1bIn = file1b.File().byLine(Yes.keepTerminator); 1071 foreach (a, b; lockstep(f1aIn, f1bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1072 1073 auto f2aIn = file2a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1074 auto f2bIn = file2b.File().byLine(Yes.keepTerminator); 1075 foreach (a, b; lockstep(f2aIn, f2bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1076 1077 auto f3aIn = file3a.File().bufferedByLine!(Yes.keepTerminator, char, '\n', readSize, growSize); 1078 auto f3bIn = file3b.File().byLine(Yes.keepTerminator); 1079 foreach (a, b; lockstep(f3aIn, f3bIn, StoppingPolicy.requireSameLength)) assert(a == b); 1080 }} 1081 } 1082 } 1083 1084 /** 1085 joinAppend performs a join operation on an input range, appending the results to 1086 an output range. 1087 1088 joinAppend was written as a performance enhancement over using std.algorithm.joiner 1089 or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower 1090 than std.array.join with writeln. The joiner performance may be due to interaction 1091 with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter 1092 is better, but still substantially slower than join. Using join works reasonably well, 1093 but is allocating memory unnecessarily. 1094 1095 Using joinAppend with Appender is a bit faster than join, and allocates less memory. 1096 The Appender re-uses the underlying data buffer, saving memory. The example below 1097 illustrates. It is a modification of the InputFieldReordering example. The role 1098 Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange 1099 uses a similar technique to buffer multiple lines. 1100 1101 Note: The original uses joinAppend have been replaced by BufferedOutputRange, which has 1102 its own joinAppend method. However, joinAppend remains useful when constructing internal 1103 buffers where BufferedOutputRange is not appropriate. 1104 1105 --- 1106 int main(string[] args) 1107 { 1108 import tsvutil; 1109 import std.algorithm, std.array, std.range, std.stdio; 1110 size_t[] fieldIndicies = [3, 0, 2]; 1111 auto fieldReordering = new InputFieldReordering!char(fieldIndicies); 1112 auto outputBuffer = appender!(char[]); 1113 foreach (line; stdin.byLine) 1114 { 1115 fieldReordering.initNewLine; 1116 foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate) 1117 { 1118 fieldReordering.processNextField(fieldIndex, fieldValue); 1119 if (fieldReordering.allFieldsFilled) break; 1120 } 1121 if (fieldReordering.allFieldsFilled) 1122 { 1123 outputBuffer.clear; 1124 writeln(fieldReordering.outputFields.joinAppend(outputBuffer, ('\t'))); 1125 } 1126 else 1127 { 1128 writeln("Error: Insufficient number of field on the line."); 1129 } 1130 } 1131 return 0; 1132 } 1133 --- 1134 */ 1135 OutputRange joinAppend(InputRange, OutputRange, E) 1136 (InputRange inputRange, ref OutputRange outputRange, E delimiter) 1137 if (isInputRange!InputRange && 1138 (is(ElementType!InputRange : const E[]) && 1139 isOutputRange!(OutputRange, E[])) 1140 || 1141 (is(ElementType!InputRange : const E) && 1142 isOutputRange!(OutputRange, E)) 1143 ) 1144 { 1145 if (!inputRange.empty) 1146 { 1147 outputRange.put(inputRange.front); 1148 inputRange.popFront; 1149 } 1150 foreach (x; inputRange) 1151 { 1152 outputRange.put(delimiter); 1153 outputRange.put(x); 1154 } 1155 return outputRange; 1156 } 1157 1158 // joinAppend. 1159 @safe unittest 1160 { 1161 import std.array : appender; 1162 import std.algorithm : equal; 1163 1164 char[] c1 = ['a', 'b', 'c']; 1165 char[] c2 = ['d', 'e', 'f']; 1166 char[] c3 = ['g', 'h', 'i']; 1167 auto cvec = [c1, c2, c3]; 1168 1169 auto s1 = "abc"; 1170 auto s2 = "def"; 1171 auto s3 = "ghi"; 1172 auto svec = [s1, s2, s3]; 1173 1174 auto charAppender = appender!(char[])(); 1175 1176 assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi"); 1177 assert(equal(cvec, [c1, c2, c3])); 1178 1179 charAppender.put('$'); 1180 assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi"); 1181 assert(equal(cvec, [s1, s2, s3])); 1182 1183 charAppender.clear; 1184 assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi"); 1185 1186 auto intAppender = appender!(int[])(); 1187 1188 auto i1 = [100, 101, 102]; 1189 auto i2 = [200, 201, 202]; 1190 auto i3 = [300, 301, 302]; 1191 auto ivec = [i1, i2, i3]; 1192 1193 assert(ivec.joinAppend(intAppender, 0).data == 1194 [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]); 1195 1196 intAppender.clear; 1197 assert(i1.joinAppend(intAppender, 0).data == 1198 [100, 0, 101, 0, 102]); 1199 assert(i2.joinAppend(intAppender, 1).data == 1200 [100, 0, 101, 0, 102, 1201 200, 1, 201, 1, 202]); 1202 assert(i3.joinAppend(intAppender, 2).data == 1203 [100, 0, 101, 0, 102, 1204 200, 1, 201, 1, 202, 1205 300, 2, 301, 2, 302]); 1206 } 1207 1208 /** 1209 getTsvFieldValue extracts the value of a single field from a delimited text string. 1210 1211 This is a convenience function intended for cases when only a single field from an 1212 input line is needed. If multiple values are needed, it will be more efficient to 1213 work directly with std.algorithm.splitter or the InputFieldReordering class. 1214 1215 The input text is split by a delimiter character. The specified field is converted 1216 to the desired type and the value returned. 1217 1218 An exception is thrown if there are not enough fields on the line or if conversion 1219 fails. Conversion is done with std.conv.to, it throws a std.conv.ConvException on 1220 failure. If not enough fields, the exception text is generated referencing 1-upped 1221 field numbers as would be provided by command line users. 1222 */ 1223 T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim) 1224 if (isSomeChar!C) 1225 { 1226 import std.algorithm : splitter; 1227 import std.conv : to; 1228 import std.format : format; 1229 import std.range; 1230 1231 auto splitLine = line.splitter(delim); 1232 size_t atField = 0; 1233 1234 while (atField < fieldIndex && !splitLine.empty) 1235 { 1236 splitLine.popFront; 1237 atField++; 1238 } 1239 1240 T val; 1241 if (splitLine.empty) 1242 { 1243 if (fieldIndex == 0) 1244 { 1245 /* This is a workaround to a splitter special case - If the input is empty, 1246 * the returned split range is empty. This doesn't properly represent a single 1247 * column file. More correct mathematically, and for this case, would be a 1248 * single value representing an empty string. The input line is a convenient 1249 * source of an empty line. Info: 1250 * Bug: https://issues.dlang.org/show_bug.cgi?id=15735 1251 * Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 1252 */ 1253 assert(line.empty); 1254 val = line.to!T; 1255 } 1256 else 1257 { 1258 throw new Exception( 1259 format("Not enough fields on line. Number required: %d; Number found: %d", 1260 fieldIndex + 1, atField)); 1261 } 1262 } 1263 else 1264 { 1265 val = splitLine.front.to!T; 1266 } 1267 1268 return val; 1269 } 1270 1271 // getTsvFieldValue. 1272 @safe unittest 1273 { 1274 import std.conv : ConvException, to; 1275 import std.exception; 1276 1277 /* Common cases. */ 1278 assert(getTsvFieldValue!double("123", 0, '\t') == 123.0); 1279 assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5); 1280 assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123); 1281 assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 1282 assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 1283 assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99"); 1284 assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红"); 1285 assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅"); 1286 assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑"); 1287 assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5); 1288 1289 /* The empty field cases. */ 1290 assert(getTsvFieldValue!string("", 0, '\t') == ""); 1291 assert(getTsvFieldValue!string("\t", 0, '\t') == ""); 1292 assert(getTsvFieldValue!string("\t", 1, '\t') == ""); 1293 assert(getTsvFieldValue!string("", 0, ':') == ""); 1294 assert(getTsvFieldValue!string(":", 0, ':') == ""); 1295 assert(getTsvFieldValue!string(":", 1, ':') == ""); 1296 1297 /* Tests with different data types. */ 1298 string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5"; 1299 char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]); 1300 dchar[] dcharLine = stringLine.to!(dchar[]); 1301 wchar[] wcharLine = stringLine.to!(wchar[]); 1302 1303 assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black"); 1304 assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1305 assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1306 assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5); 1307 1308 assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black"); 1309 assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1310 assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1311 assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5); 1312 1313 assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black"); 1314 assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1315 assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1316 assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5); 1317 1318 assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black"); 1319 assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1320 assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1321 assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5); 1322 1323 /* Conversion errors. */ 1324 assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t')); 1325 assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|')); 1326 assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|')); 1327 assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|')); 1328 assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|')); 1329 1330 /* Not enough field errors. These should throw, but not a ConvException.*/ 1331 assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t'))); 1332 assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t'))); 1333 assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t'))); 1334 } 1335 1336 /** [Yes|No.newlineWasRemoved] is a template parameter to throwIfWindowsNewlineOnUnix. 1337 * A Yes value indicates the Unix newline was already removed, as might be done via 1338 * std.File.byLine or similar mechanism. 1339 */ 1340 alias NewlineWasRemoved = Flag!"newlineWasRemoved"; 1341 1342 /** 1343 throwIfWindowsLineNewlineOnUnix is used to throw an exception if a Windows/DOS 1344 line ending is found on a build compiled for a Unix platform. This is used by 1345 the TSV Utilities to detect Window/DOS line endings and terminate processing 1346 with an error message to the user. 1347 */ 1348 void throwIfWindowsNewlineOnUnix 1349 (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved) 1350 (const char[] line, const char[] filename, size_t lineNum) 1351 { 1352 version(Posix) 1353 { 1354 static if (nlWasRemoved) 1355 { 1356 immutable bool hasWindowsLineEnding = line.length != 0 && line[$ - 1] == '\r'; 1357 } 1358 else 1359 { 1360 immutable bool hasWindowsLineEnding = 1361 line.length > 1 && 1362 line[$ - 2] == '\r' && 1363 line[$ - 1] == '\n'; 1364 } 1365 1366 if (hasWindowsLineEnding) 1367 { 1368 import std.format; 1369 throw new Exception( 1370 format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n File: %s, Line: %s", 1371 (filename == "-") ? "Standard Input" : filename, lineNum)); 1372 } 1373 } 1374 } 1375 1376 // throwIfWindowsNewlineOnUnix 1377 @safe unittest 1378 { 1379 /* Note: Currently only building on Posix. Need to add non-Posix test cases 1380 * if Windows builds are ever done. 1381 */ 1382 version(Posix) 1383 { 1384 import std.exception; 1385 1386 assertNotThrown(throwIfWindowsNewlineOnUnix("", "afile.tsv", 1)); 1387 assertNotThrown(throwIfWindowsNewlineOnUnix("a", "afile.tsv", 2)); 1388 assertNotThrown(throwIfWindowsNewlineOnUnix("ab", "afile.tsv", 3)); 1389 assertNotThrown(throwIfWindowsNewlineOnUnix("abc", "afile.tsv", 4)); 1390 1391 assertThrown(throwIfWindowsNewlineOnUnix("\r", "afile.tsv", 1)); 1392 assertThrown(throwIfWindowsNewlineOnUnix("a\r", "afile.tsv", 2)); 1393 assertThrown(throwIfWindowsNewlineOnUnix("ab\r", "afile.tsv", 3)); 1394 assertThrown(throwIfWindowsNewlineOnUnix("abc\r", "afile.tsv", 4)); 1395 1396 assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\n", "afile.tsv", 1)); 1397 assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\n", "afile.tsv", 2)); 1398 assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\n", "afile.tsv", 3)); 1399 assertNotThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\n", "afile.tsv", 4)); 1400 1401 assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "afile.tsv", 5)); 1402 assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("a\r\n", "afile.tsv", 6)); 1403 assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("ab\r\n", "afile.tsv", 7)); 1404 assertThrown(throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("abc\r\n", "afile.tsv", 8)); 1405 1406 /* Standard Input formatting. */ 1407 import std.algorithm : endsWith; 1408 bool exceptionCaught = false; 1409 1410 try (throwIfWindowsNewlineOnUnix("\r", "-", 99)); 1411 catch (Exception e) 1412 { 1413 assert(e.msg.endsWith("File: Standard Input, Line: 99")); 1414 exceptionCaught = true; 1415 } 1416 finally 1417 { 1418 assert(exceptionCaught); 1419 exceptionCaught = false; 1420 } 1421 1422 try (throwIfWindowsNewlineOnUnix!(No.newlineWasRemoved)("\r\n", "-", 99)); 1423 catch (Exception e) 1424 { 1425 assert(e.msg.endsWith("File: Standard Input, Line: 99")); 1426 exceptionCaught = true; 1427 } 1428 finally 1429 { 1430 assert(exceptionCaught); 1431 exceptionCaught = false; 1432 } 1433 } 1434 } 1435 1436 /** Flag used by InputSourceRange to determine if the header line should be when 1437 opening a file. 1438 */ 1439 alias ReadHeader = Flag!"readHeader"; 1440 1441 /** 1442 inputSourceRange is a helper function for creating new InputSourceRange objects. 1443 */ 1444 InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader) 1445 { 1446 return new InputSourceRange(filepaths, readHeader); 1447 } 1448 1449 /** 1450 InputSourceRange is an input range that iterates over a set of input files. 1451 1452 InputSourceRange is used to iterate over a set of files passed on the command line. 1453 Files are automatically opened and closed during iteration. The caller can choose to 1454 have header lines read automatically. 1455 1456 The range is created from a set of filepaths. These filepaths are mapped to 1457 InputSource objects during the iteration. This is what enables automatically opening 1458 and closing files and reading the header line. 1459 1460 The motivation for an InputSourceRange is to provide a standard way to look at the 1461 header line of the first input file during command line argument processing, and then 1462 pass the open input file and the header line along to the main processing functions. 1463 This enables a features like named fields to be implemented in a standard way. 1464 1465 Both InputSourceRange and InputSource are reference objects. This keeps their use 1466 limited to a single iteration over the set of files. The files can be iterated again 1467 by creating a new InputSourceRange against the same filepaths. 1468 1469 Currently, InputSourceRange supports files and standard input. It is possible other 1470 types of input sources will be added in the future. 1471 */ 1472 final class InputSourceRange 1473 { 1474 import std.range; 1475 1476 private string[] _filepaths; 1477 private ReadHeader _readHeader; 1478 private InputSource _front; 1479 1480 this(string[] filepaths, ReadHeader readHeader) 1481 { 1482 _filepaths = filepaths.dup; 1483 _readHeader = readHeader; 1484 _front = null; 1485 1486 if (!_filepaths.empty) 1487 { 1488 _front = new InputSource(_filepaths.front, _readHeader); 1489 _front.open; 1490 _filepaths.popFront; 1491 } 1492 } 1493 1494 size_t length() const pure nothrow @safe 1495 { 1496 return empty ? 0 : _filepaths.length + 1; 1497 } 1498 1499 bool empty() const pure nothrow @safe 1500 { 1501 return _front is null; 1502 } 1503 1504 InputSource front() pure @safe 1505 { 1506 assert(!empty, "Attempt to take the front of an empty InputSourceRange"); 1507 return _front; 1508 } 1509 1510 void popFront() 1511 { 1512 assert(!empty, "Attempt to popFront an empty InputSourceRange"); 1513 1514 _front.close; 1515 1516 if (!_filepaths.empty) 1517 { 1518 _front = new InputSource(_filepaths.front, _readHeader); 1519 _front.open; 1520 _filepaths.popFront; 1521 } 1522 else 1523 { 1524 _front = null; 1525 } 1526 } 1527 } 1528 1529 /** 1530 InputSource is a class of objects produced by iterating over an InputSourceRange. 1531 1532 An InputSource object provides access to the open file currently the front element 1533 of an InputSourceRange. The main methods application code is likely to need are: 1534 1535 $(LIST 1536 * `file()` - Returns the File object. The file will be open for reading as long 1537 InputSource instance is the front element of the InputSourceRange it came from. 1538 1539 * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the 1540 header line from the file. An empty string is returned if InputSource range 1541 was created with readHeader=false. 1542 1543 * `name()` - The name of the input source. The name returned is intended for 1544 user error messages. For files, this is the filepath that was passed to 1545 InputSourceRange. For standard input, it is "Standard Input". 1546 ) 1547 1548 An InputSource is a reference object, so the copies will retain the state of the 1549 InputSourceRange front element. In particular, all copies will have the open 1550 state of the front element of the InputSourceRange. 1551 1552 This class is not intended for use outside the context of an InputSourceRange. 1553 */ 1554 final class InputSource 1555 { 1556 import std.range; 1557 import std.stdio; 1558 1559 private immutable string _filepath; 1560 private immutable bool _isStdin; 1561 private bool _isOpen; 1562 private ReadHeader _readHeader; 1563 private bool _hasBeenOpened; 1564 private string _header; 1565 private File _file; 1566 1567 private this(string filepath, ReadHeader readHeader) pure nothrow @safe 1568 { 1569 _filepath = filepath; 1570 _isStdin = filepath == "-"; 1571 _isOpen = false; 1572 _readHeader = readHeader; 1573 _hasBeenOpened = false; 1574 } 1575 1576 /** file returns the File object held by the InputSource. 1577 * 1578 * The File will be open for reading as long as the InputSource instance is the 1579 * front element of the InputSourceRange it came from. 1580 */ 1581 File file() nothrow @safe 1582 { 1583 return _file; 1584 } 1585 1586 /** isReadHeaderEnabled returns true if the header line is being read. 1587 */ 1588 bool isReadHeaderEnabled() const pure nothrow @safe 1589 { 1590 return _readHeader == Yes.readHeader; 1591 } 1592 1593 /** header returns the header line from the input file. 1594 * 1595 * An empty string is returned if InputSource range was created with 1596 * readHeader=false. 1597 */ 1598 string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe 1599 { 1600 assert(_hasBeenOpened); 1601 return (keepTerminator == Yes.keepTerminator || 1602 _header.length == 0 || 1603 _header[$ - 1] != '\n') ? 1604 _header : _header[0 .. $-1]; 1605 } 1606 1607 /** isHeaderEmpty returns true if there is no data for a header, including the 1608 * terminator. 1609 * 1610 * When headers are being read, this true only if the file is empty. 1611 */ 1612 bool isHeaderEmpty() const pure nothrow @safe 1613 { 1614 assert(_hasBeenOpened); 1615 return _header.empty; 1616 } 1617 1618 /** name returns a user friendly name representing the input source. 1619 * 1620 * For files, it is the filepath provided to InputSourceRange. For standard 1621 * input, it is "Standard Input". (Use isStdin() to test for standard input, 1622 * not name(). 1623 */ 1624 string name() const pure nothrow @safe 1625 { 1626 return _isStdin ? "Standard Input" : _filepath; 1627 } 1628 1629 /** isStdin returns true if the input source is Standard Input, false otherwise. 1630 */ 1631 bool isStdin() const pure nothrow @safe 1632 { 1633 return _isStdin; 1634 } 1635 1636 /** isOpen returns true if the input source is open for reading, false otherwise. 1637 * 1638 * "Open" in this context is whether the InputSource object is currently open, 1639 * meaning that it is the front element of the InputSourceRange that created it. 1640 * 1641 * For files, this is also reflected in the state of the underlying File object. 1642 * However, standard input is never actually closed. 1643 */ 1644 bool isOpen() const pure nothrow @safe 1645 { 1646 return _isOpen; 1647 } 1648 1649 private void open() 1650 { 1651 assert(!_isOpen); 1652 assert(!_hasBeenOpened); 1653 1654 _file = isStdin ? stdin : _filepath.File("rb"); 1655 if (_readHeader) _header = _file.readln; 1656 _isOpen = true; 1657 _hasBeenOpened = true; 1658 } 1659 1660 private void close() 1661 { 1662 if (!_isStdin) _file.close; 1663 _isOpen = false; 1664 } 1665 } 1666 1667 // InputSourceRange and InputSource 1668 unittest 1669 { 1670 import std.algorithm : all, each; 1671 import std.array : appender; 1672 import std.exception : assertThrown; 1673 import std.file : rmdirRecurse; 1674 import std.path : buildPath; 1675 import std.range; 1676 import std.stdio; 1677 import tsv_utils.common.unittest_utils; 1678 1679 auto testDir = makeUnittestTempDir("tsv_utils_input_source_range"); 1680 scope(exit) testDir.rmdirRecurse; 1681 1682 string file0 = buildPath(testDir, "file0.txt"); 1683 string file1 = buildPath(testDir, "file1.txt"); 1684 string file2 = buildPath(testDir, "file2.txt"); 1685 string file3 = buildPath(testDir, "file3.txt"); 1686 1687 string file0Header = ""; 1688 string file1Header = "file 1 header\n"; 1689 string file2Header = "file 2 header\n"; 1690 string file3Header = "file 3 header\n"; 1691 1692 string file0Body = ""; 1693 string file1Body = ""; 1694 string file2Body = "file 2 line 1\n"; 1695 string file3Body = "file 3 line 1\nfile 3 line 2\n"; 1696 1697 string file0Data = file0Header ~ file0Body; 1698 string file1Data = file1Header ~ file1Body; 1699 string file2Data = file2Header ~ file2Body; 1700 string file3Data = file3Header ~ file3Body; 1701 1702 { 1703 file0.File("w").write(file0Data); 1704 file1.File("w").write(file1Data); 1705 file2.File("w").write(file2Data); 1706 file3.File("w").write(file3Data); 1707 } 1708 1709 auto inputFiles = [file0, file1, file2, file3]; 1710 auto fileHeaders = [file0Header, file1Header, file2Header, file3Header]; 1711 auto fileBodies = [file0Body, file1Body, file2Body, file3Body]; 1712 auto fileData = [file0Data, file1Data, file2Data, file3Data]; 1713 1714 auto readSources = appender!(InputSource[]); 1715 auto buffer = new char[1024]; // Must be large enough to hold the test files. 1716 1717 /* Tests without standard input. Don't want to count on state of standard 1718 * input or modifying it when doing unit tests, so avoid reading from it. 1719 */ 1720 1721 foreach(numFiles; 1 .. inputFiles.length + 1) 1722 { 1723 /* Reading headers. */ 1724 1725 readSources.clear; 1726 auto inputSourcesYesHeader = inputSourceRange(inputFiles[0 .. numFiles], Yes.readHeader); 1727 assert(inputSourcesYesHeader.length == numFiles); 1728 1729 foreach(fileNum, source; inputSourcesYesHeader.enumerate) 1730 { 1731 readSources.put(source); 1732 assert(source.isOpen); 1733 assert(source.file.isOpen); 1734 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 1735 assert(readSources.data[fileNum].isOpen); 1736 1737 assert(source.header(Yes.keepTerminator) == fileHeaders[fileNum]); 1738 1739 auto headerNoTerminatorLength = fileHeaders[fileNum].length; 1740 if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 1741 assert(source.header(No.keepTerminator) == 1742 fileHeaders[fileNum][0 .. headerNoTerminatorLength]); 1743 1744 assert(source.name == inputFiles[fileNum]); 1745 assert(!source.isStdin); 1746 assert(source.isReadHeaderEnabled); 1747 1748 assert(source.file.rawRead(buffer) == fileBodies[fileNum]); 1749 } 1750 1751 /* The InputSourceRange is a reference range, consumed by the foreach. */ 1752 assert(inputSourcesYesHeader.empty); 1753 1754 /* Without reading headers. */ 1755 1756 readSources.clear; 1757 auto inputSourcesNoHeader = inputSourceRange(inputFiles[0 .. numFiles], No.readHeader); 1758 assert(inputSourcesNoHeader.length == numFiles); 1759 1760 foreach(fileNum, source; inputSourcesNoHeader.enumerate) 1761 { 1762 readSources.put(source); 1763 assert(source.isOpen); 1764 assert(source.file.isOpen); 1765 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); 1766 assert(readSources.data[fileNum].isOpen); 1767 1768 assert(source.header(Yes.keepTerminator).empty); 1769 assert(source.header(No.keepTerminator).empty); 1770 1771 assert(source.name == inputFiles[fileNum]); 1772 assert(!source.isStdin); 1773 assert(!source.isReadHeaderEnabled); 1774 1775 assert(source.file.rawRead(buffer) == fileData[fileNum]); 1776 } 1777 1778 /* The InputSourceRange is a reference range, consumed by the foreach. */ 1779 assert(inputSourcesNoHeader.empty); 1780 } 1781 1782 /* Tests with standard input. No actual reading in these tests. 1783 */ 1784 1785 readSources.clear; 1786 foreach(fileNum, source; inputSourceRange(["-", "-"], No.readHeader).enumerate) 1787 { 1788 readSources.put(source); 1789 assert(source.isOpen); 1790 assert(source.file.isOpen); 1791 assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen)); // InputSource objects are "closed". 1792 assert(readSources.data[0 .. fileNum].all!(s => s.file.isOpen)); // Actual stdin should not be closed. 1793 assert(readSources.data[fileNum].isOpen); 1794 1795 assert(source.header(Yes.keepTerminator).empty); 1796 assert(source.header(No.keepTerminator).empty); 1797 1798 assert(source.name == "Standard Input"); 1799 assert(source.isStdin); 1800 } 1801 1802 /* Empty filelist. */ 1803 string[] nofiles; 1804 { 1805 auto sources = inputSourceRange(nofiles, No.readHeader); 1806 assert(sources.empty); 1807 } 1808 { 1809 auto sources = inputSourceRange(nofiles, Yes.readHeader); 1810 assert(sources.empty); 1811 } 1812 1813 /* Error cases. */ 1814 assertThrown(inputSourceRange([file0, "no_such_file.txt"], No.readHeader).each); 1815 assertThrown(inputSourceRange(["no_such_file.txt", file1], Yes.readHeader).each); 1816 } 1817 1818 /** 1819 byLineSourceRange is a helper function for creating new byLineSourceRange objects. 1820 */ 1821 auto byLineSourceRange( 1822 KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n') 1823 (string[] filepaths) 1824 if (is(Char == char) || is(Char == ubyte)) 1825 { 1826 return new ByLineSourceRange!(keepTerminator, Char, terminator)(filepaths); 1827 } 1828 1829 /** 1830 ByLineSourceRange is an input range that iterates over a set of input files. It 1831 provides bufferedByLine access to each file. 1832 1833 A ByLineSourceRange is used to iterate over a set of files passed on the command line. 1834 Files are automatically opened and closed during iteration. The front element of the 1835 range provides access to a bufferedByLine for iterating over the lines in the file. 1836 1837 The range is created from a set of filepaths. These filepaths are mapped to 1838 ByLineSource objects during the iteration. This is what enables automatically opening 1839 and closing files and providing bufferedByLine access. 1840 1841 The motivation behind ByLineSourceRange is to provide a standard way to look at the 1842 header line of the first input file during command line argument processing, and then 1843 pass the open input file along to the main processing functions. This enables 1844 features like named fields to be implemented in a standard way. 1845 1846 Access to the first line of the first file is available after creating the 1847 ByLineSourceRange instance. The first file is opened and a bufferedByLine created. 1848 The first line of the first file is via byLine.front (after checking !byLine.empty). 1849 1850 Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use 1851 limited to a single iteration over the set of files. The files can be iterated again 1852 by creating a new InputSourceRange against the same filepaths. 1853 1854 Currently, ByLineSourceRange supports files and standard input. It is possible other 1855 types of input sources will be added in the future. 1856 */ 1857 final class ByLineSourceRange( 1858 KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n') 1859 if (is(Char == char) || is(Char == ubyte)) 1860 { 1861 import std.range; 1862 1863 alias ByLineSourceType = ByLineSource!(keepTerminator, char, terminator); 1864 1865 private string[] _filepaths; 1866 private ByLineSourceType _front; 1867 1868 this(string[] filepaths) 1869 { 1870 _filepaths = filepaths.dup; 1871 _front = null; 1872 1873 if (!_filepaths.empty) 1874 { 1875 _front = new ByLineSourceType(_filepaths.front); 1876 _front.open; 1877 _filepaths.popFront; 1878 } 1879 } 1880 1881 size_t length() const pure nothrow @safe 1882 { 1883 return empty ? 0 : _filepaths.length + 1; 1884 } 1885 1886 bool empty() const pure nothrow @safe 1887 { 1888 return _front is null; 1889 } 1890 1891 ByLineSourceType front() pure @safe 1892 { 1893 assert(!empty, "Attempt to take the front of an empty ByLineSourceRange"); 1894 return _front; 1895 } 1896 1897 void popFront() 1898 { 1899 assert(!empty, "Attempt to popFront an empty ByLineSourceRange"); 1900 1901 _front.close; 1902 1903 if (!_filepaths.empty) 1904 { 1905 _front = new ByLineSourceType(_filepaths.front); 1906 _front.open; 1907 _filepaths.popFront; 1908 } 1909 else 1910 { 1911 _front = null; 1912 } 1913 } 1914 } 1915 1916 /** 1917 ByLineSource is a class of objects produced by iterating over an ByLineSourceRange. 1918 1919 A ByLineSource instance provides a bufferedByLine range for the current the front 1920 element of a ByLineSourceRange. The main methods application code is likely to 1921 need are: 1922 1923 $(LIST 1924 * `byLine()` - Returns the bufferedByLine range accessing the open file. The file 1925 will be open for reading (using the bufferedByLine range) as long as the 1926 ByLineSource instance is the front element of the ByLineSourceRange 1927 it came from. 1928 1929 * `name()` - The name of the input source. The name returned is intended for 1930 user error messages. For files, this is the filepath that was passed to 1931 ByLineSourceRange. For standard input, it is "Standard Input". 1932 ) 1933 1934 A ByLineSource is a reference object, so the copies have the same state as the 1935 ByLineSourceRange front element. In particular, all copies will have the open 1936 state of the front element of the ByLineSourceRange. 1937 1938 This class is not intended for use outside the context of an ByLineSourceRange. 1939 */ 1940 final class ByLineSource( 1941 KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n') 1942 if (is(Char == char) || is(Char == ubyte)) 1943 { 1944 import std.range; 1945 import std.stdio; 1946 import std.traits : ReturnType; 1947 1948 alias newByLineFn = bufferedByLine!(keepTerminator, char, terminator); 1949 alias ByLineType = ReturnType!newByLineFn; 1950 1951 private immutable string _filepath; 1952 private immutable bool _isStdin; 1953 private bool _isOpen; 1954 private bool _hasBeenOpened; 1955 private File _file; 1956 private ByLineType _byLineRange; 1957 1958 private this(string filepath) pure nothrow @safe 1959 { 1960 _filepath = filepath; 1961 _isStdin = filepath == "-"; 1962 _isOpen = false; 1963 _hasBeenOpened = false; 1964 } 1965 1966 /** byLine returns the bufferedByLine object held by the ByLineSource instance. 1967 * 1968 * The File underlying the BufferedByLine object is open for reading as long as 1969 * the ByLineSource instance is the front element of the ByLineSourceRange it 1970 * came from. 1971 */ 1972 ByLineType byLine() nothrow @safe 1973 { 1974 return _byLineRange; 1975 } 1976 1977 /** name returns a user friendly name representing the underlying input source. 1978 * 1979 * For files, it is the filepath provided to ByLineSourceRange. For standard 1980 * input, it is "Standard Input". (Use isStdin() to test for standard input, 1981 * compare against name().) 1982 */ 1983 string name() const pure nothrow @safe 1984 { 1985 return _isStdin ? "Standard Input" : _filepath; 1986 } 1987 1988 /** isStdin returns true if the underlying input source is Standard Input, false 1989 * otherwise. 1990 */ 1991 bool isStdin() const pure nothrow @safe 1992 { 1993 return _isStdin; 1994 } 1995 1996 /** isOpen returns true if the ByLineSource instance is open for reading, false 1997 * otherwise. 1998 * 1999 * "Open" in this context is whether the ByLineSource object is currently "open". 2000 * The underlying input source backing it does not necessarily have the same 2001 * state. The ByLineSource instance is "open" if is the front element of the 2002 * ByLineSourceRange that created it. 2003 * 2004 * The underlying input source object follows the same open/close state as makes 2005 * sense. In particular, real files are closed when the ByLineSource object is 2006 * closed. The exception is standard input, which is never actually closed. 2007 */ 2008 bool isOpen() const pure nothrow @safe 2009 { 2010 return _isOpen; 2011 } 2012 2013 private void open() 2014 { 2015 assert(!_isOpen); 2016 assert(!_hasBeenOpened); 2017 2018 _file = isStdin ? stdin : _filepath.File("rb"); 2019 _byLineRange = newByLineFn(_file); 2020 _isOpen = true; 2021 _hasBeenOpened = true; 2022 } 2023 2024 private void close() 2025 { 2026 if (!_isStdin) _file.close; 2027 _isOpen = false; 2028 } 2029 } 2030 2031 // ByLineSourceRange and ByLineSource 2032 unittest 2033 { 2034 import std.algorithm : all, each; 2035 import std.array : appender; 2036 import std.exception : assertThrown; 2037 import std.file : rmdirRecurse; 2038 import std.path : buildPath; 2039 import std.range; 2040 import std.stdio; 2041 import tsv_utils.common.unittest_utils; 2042 2043 auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range"); 2044 scope(exit) testDir.rmdirRecurse; 2045 2046 string file0 = buildPath(testDir, "file0.txt"); 2047 string file1 = buildPath(testDir, "file1.txt"); 2048 string file2 = buildPath(testDir, "file2.txt"); 2049 string file3 = buildPath(testDir, "file3.txt"); 2050 2051 string file0Header = ""; 2052 string file1Header = "file 1 header\n"; 2053 string file2Header = "file 2 header\n"; 2054 string file3Header = "file 3 header\n"; 2055 2056 string file0Body = ""; 2057 string file1Body = ""; 2058 string file2Body = "file 2 line 1\n"; 2059 string file3Body = "file 3 line 1\nfile 3 line 2\n"; 2060 2061 string file0Data = file0Header ~ file0Body; 2062 string file1Data = file1Header ~ file1Body; 2063 string file2Data = file2Header ~ file2Body; 2064 string file3Data = file3Header ~ file3Body; 2065 2066 { 2067 file0.File("w").write(file0Data); 2068 file1.File("w").write(file1Data); 2069 file2.File("w").write(file2Data); 2070 file3.File("w").write(file3Data); 2071 } 2072 2073 auto inputFiles = [file0, file1, file2, file3]; 2074 auto fileHeaders = [file0Header, file1Header, file2Header, file3Header]; 2075 auto fileBodies = [file0Body, file1Body, file2Body, file3Body]; 2076 auto fileData = [file0Data, file1Data, file2Data, file3Data]; 2077 2078 auto buffer = new char[1024]; // Must be large enough to hold the test files. 2079 2080 /* Tests without standard input. Don't want to count on state of standard 2081 * input or modifying it when doing unit tests, so avoid reading from it. 2082 */ 2083 2084 auto readSourcesNoTerminator = appender!(ByLineSource!(No.keepTerminator)[]); 2085 auto readSourcesYesTerminator = appender!(ByLineSource!(Yes.keepTerminator)[]); 2086 2087 foreach(numFiles; 1 .. inputFiles.length + 1) 2088 { 2089 /* Using No.keepTerminator. */ 2090 readSourcesNoTerminator.clear; 2091 auto inputSourcesNoTerminator = byLineSourceRange!(No.keepTerminator)(inputFiles[0 .. numFiles]); 2092 assert(inputSourcesNoTerminator.length == numFiles); 2093 2094 foreach(fileNum, source; inputSourcesNoTerminator.enumerate) 2095 { 2096 readSourcesNoTerminator.put(source); 2097 assert(source.isOpen); 2098 assert(source._file.isOpen); 2099 assert(readSourcesNoTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 2100 assert(readSourcesNoTerminator.data[fileNum].isOpen); 2101 2102 auto headerNoTerminatorLength = fileHeaders[fileNum].length; 2103 if (headerNoTerminatorLength > 0) --headerNoTerminatorLength; 2104 2105 assert(source.byLine.empty || 2106 source.byLine.front == fileHeaders[fileNum][0 .. headerNoTerminatorLength]); 2107 2108 assert(source.name == inputFiles[fileNum]); 2109 assert(!source.isStdin); 2110 2111 auto readFileData = appender!(char[]); 2112 foreach(line; source.byLine) 2113 { 2114 readFileData.put(line); 2115 readFileData.put('\n'); 2116 } 2117 2118 assert(readFileData.data == fileData[fileNum]); 2119 } 2120 2121 /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 2122 assert(inputSourcesNoTerminator.empty); 2123 2124 /* Using Yes.keepTerminator. */ 2125 readSourcesYesTerminator.clear; 2126 auto inputSourcesYesTerminator = byLineSourceRange!(Yes.keepTerminator)(inputFiles[0 .. numFiles]); 2127 assert(inputSourcesYesTerminator.length == numFiles); 2128 2129 foreach(fileNum, source; inputSourcesYesTerminator.enumerate) 2130 { 2131 readSourcesYesTerminator.put(source); 2132 assert(source.isOpen); 2133 assert(source._file.isOpen); 2134 assert(readSourcesYesTerminator.data[0 .. fileNum].all!(s => !s.isOpen)); 2135 assert(readSourcesYesTerminator.data[fileNum].isOpen); 2136 2137 assert(source.byLine.empty || source.byLine.front == fileHeaders[fileNum]); 2138 2139 assert(source.name == inputFiles[fileNum]); 2140 assert(!source.isStdin); 2141 2142 auto readFileData = appender!(char[]); 2143 foreach(line; source.byLine) 2144 { 2145 readFileData.put(line); 2146 } 2147 2148 assert(readFileData.data == fileData[fileNum]); 2149 } 2150 2151 /* The ByLineSourceRange is a reference range, consumed by the foreach. */ 2152 assert(inputSourcesYesTerminator.empty); 2153 } 2154 2155 /* Empty filelist. */ 2156 string[] nofiles; 2157 { 2158 auto sources = byLineSourceRange!(No.keepTerminator)(nofiles); 2159 assert(sources.empty); 2160 } 2161 { 2162 auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles); 2163 assert(sources.empty); 2164 } 2165 2166 /* Error cases. */ 2167 assertThrown(byLineSourceRange!(No.keepTerminator)([file0, "no_such_file.txt"]).each); 2168 assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", file1]).each); 2169 }