/**
Utilities used by tsv-utils applications. InputFieldReordering, BufferedOutputRange,
and several others.

Utilities in this file:
$(LIST
    * [InputFieldReordering] - A class that creates a reordered subset of fields from
      an input line. Fields in the subset are accessed by array indices. This is
      especially useful when processing the subset in a specific order, such as the
      order listed on the command-line at run-time.

    * [BufferedOutputRange] - An OutputRange with an internal buffer used to buffer
      output. Intended for use with stdout, it is a significant performance benefit.

    * [isFlushableOutputRange] - Tests if something is an OutputRange with a flush
      member.

    * [bufferedByLine] - An input range that reads from a File handle line by line.
      It is similar to the standard library method std.stdio.File.byLine, but quite a
      bit faster. This is achieved by reading in larger blocks and buffering.

    * [InputSourceRange] - An input range that provides open file access to a set of
      files. It is used to iterate over files passed as command line arguments. This
      enables reading the header line of a file during command line argument
      processing, then passing the open file to the main processing functions.

    * [ByLineSourceRange] - Similar to an InputSourceRange, except that it provides
      access to a byLine iterator (bufferedByLine) rather than an open file. This is
      used by tools that run the same processing logic on both header and non-header
      lines.

    * [isBufferableInputSource] - Tests if a file or input range can be read in a
      buffered fashion by inputSourceByChunk.

    * [inputSourceByChunk] - Returns a range that reads from a file handle (File) or
      a ubyte input range a chunk at a time.

    * [joinAppend] - A function that performs a join, but appending the join output to
      an output stream. It is a performance improvement over using join or joiner with
      writeln.

    * [getTsvFieldValue] - A convenience function when only a single value is needed
      from an input line.

    * [throwIfWindowsNewline] - A utility for detecting Windows newlines in input.
)

Copyright (c) 2015-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/

module tsv_utils.common.utils;

import std.range;
import std.stdio : File, isFileHandle, KeepTerminator;
import std.traits : isIntegral, isSomeChar, isSomeString, isUnsigned, ReturnType, Unqual;
import std.typecons : Flag, No, Yes;

// InputFieldReordering class.

/** Flag used by the InputFieldReordering template. */
alias EnablePartialLines = Flag!"enablePartialLines";

/**
InputFieldReordering - Move select fields from an input line to an output array,
reordering along the way.

An InputFieldReordering object is created once, at the start of input processing, from
the list of wanted input field indices. It keeps a mapping from input index to output
index plus a buffer holding the reordered fields. For every input line the caller calls
initNewLine, splits the line into fields, and hands each field to processNextField. The
output buffer is ready once allFieldsFilled returns true.

Fields are not copied. The output buffer refers to the slices passed in by the caller,
so the caller must use or copy the output buffer while those slices are still valid,
which is normally until the next input line is read. The program below illustrates the
basic use case. It reads stdin and outputs fields [3, 0, 2], in that order. (See also
joinAppend, below, which has a performance improvement over join used here.)

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            writeln(fieldReordering.outputFields.join('\t'));
        }
        else
        {
            writeln("Error: Insufficient number of field on the line.");
        }
    }
    return 0;
}
---

Field indices are zero-based. An individual field can be listed multiple times. The
outputFields array is not valid until all the specified fields have been processed. The
allFieldsFilled method tests this. If a line does not have enough fields the outputFields
buffer cannot be used. For most TSV applications this is okay, as it means the line is
invalid and cannot be used. However, if partial lines are okay, the template can be
instantiated with EnablePartialLines.yes. This will ensure that any fields not filled-in
are empty strings in the outputFields return.
*/
final class InputFieldReordering(C, EnablePartialLines partialLinesOk = EnablePartialLines.no)
if (isSomeChar!C)
{
    /* Implementation: The constructor builds 'fromToMap', an array of <from, to> tuples
     * pairing an input field index with a slot in the outputFields array. The array is
     * sorted in input field order, so the output slots can be filled during a single
     * pass over the input fields. An example:
     *
     *    inputFieldIndicies: [3, 0, 7, 7, 1, 0, 9]
     *    fromToMap: [<0,1>, <0,5>, <1,4>, <3,0>, <7,2>, <7,3>, <9,6>]
     *
     * While a line is being processed, the slice 'mapStack' tracks the part of
     * fromToMap that has not been consumed yet.
     */
    import std.typecons : Tuple;

    alias TupleFromTo = Tuple!(size_t, "from", size_t, "to");

    private C[][] outputFieldsBuf;
    private TupleFromTo[] fromToMap;
    private TupleFromTo[] mapStack;

    /** Builds the input-to-output mapping. Output positions are numbered in the order
     *  the entries appear in inputFieldIndicies, counting up from `start`.
     */
    final this(const ref size_t[] inputFieldIndicies, size_t start = 0) pure nothrow @safe
    {
        import std.algorithm : sort;

        immutable size_t numFields = inputFieldIndicies.length;

        outputFieldsBuf = new C[][](numFields);
        fromToMap.reserve(numFields);

        size_t outputIndex = start;
        foreach (inputIndex; inputFieldIndicies)
        {
            fromToMap ~= TupleFromTo(inputIndex, outputIndex);
            ++outputIndex;
        }

        sort(fromToMap);
        initNewLine;
    }

    /** initNewLine resets the object so a new line can be processed. With partial
     *  lines enabled, every output field is also truncated to zero length.
     */
    final void initNewLine() pure nothrow @safe
    {
        mapStack = fromToMap;

        static if (partialLinesOk)
        {
            foreach (ref field; outputFieldsBuf) field.length = 0;
        }
    }

    /** processNextField maps an input field to the correct locations in the
     *  outputFields array.
     *
     *  processNextField should be called once for each field on the line, in the order
     *  found. The processing of the line can terminate once allFieldsFilled returns
     *  true.
     *
     *  The return value is the number of output fields the input field maps to. Zero
     *  means the field is not mapped to the output fields array.
     *
     *  If, prior to allFieldsFilled returning true, any fields on the input line
     *  are not passed to processNextField, the caller should either ensure the fields
     *  are not part of the output fields or have partial lines enabled.
     */
    final size_t processNextField(size_t fieldIndex, C[] fieldValue) pure nothrow @safe @nogc
    {
        size_t matched = 0;

        /* Consume every leading map entry that refers to this input field. */
        for (; mapStack.length > 0 && mapStack[0].from == fieldIndex; mapStack = mapStack[1 .. $])
        {
            outputFieldsBuf[mapStack[0].to] = fieldValue;
            ++matched;
        }

        return matched;
    }

    /** allFieldsFilled returns true if all fields expected have been processed. */
    final bool allFieldsFilled() const pure nothrow @safe @nogc
    {
        return mapStack.length == 0;
    }

    /** outputFields is the assembled output fields. Unless partial lines are enabled,
     *  it is only valid after allFieldsFilled is true.
     */
    final C[][] outputFields() pure nothrow @safe @nogc
    {
        return outputFieldsBuf[];
    }
}

// InputFieldReordering - Tests using different character types.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    auto expected_2_0 = [["r1f2",   "r1f0"],
                         ["ÀBCßßZ", "r2f0"],
                         ["456",    "r3f0"]];

    char[][][]  charExpected_2_0 = to!(char[][][])(expected_2_0);
    wchar[][][] wcharExpected_2_0 = to!(wchar[][][])(expected_2_0);
    dchar[][][] dcharExpected_2_0 = to!(dchar[][][])(expected_2_0);
    dstring[][] dstringExpected_2_0 = to!(dstring[][])(expected_2_0);

    auto charIFR  = new InputFieldReordering!char(fields_2_0);
    auto wcharIFR = new InputFieldReordering!wchar(fields_2_0);
    auto dcharIFR = new InputFieldReordering!dchar(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        wcharIFR.initNewLine;
        dcharIFR.initNewLine;

        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            wcharIFR.processNextField(fieldIndex, to!(wchar[])(fieldValue));
            dcharIFR.processNextField(fieldIndex, to!(dchar[])(fieldValue));

            assert ((fieldIndex >= 2) == charIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == wcharIFR.allFieldsFilled);
            assert ((fieldIndex >= 2) == dcharIFR.allFieldsFilled);
        }
        assert(charIFR.allFieldsFilled);
        assert(wcharIFR.allFieldsFilled);
        assert(dcharIFR.allFieldsFilled);

        assert(charIFR.outputFields == charExpected_2_0[lineIndex]);
        assert(wcharIFR.outputFields == wcharExpected_2_0[lineIndex]);
        assert(dcharIFR.outputFields == dcharExpected_2_0[lineIndex]);
    }
}

// InputFieldReordering - Test of partial line support.
@safe unittest
{
    import std.conv : to;

    auto inputLines = [["r1f0", "r1f1", "r1f2",   "r1f3"],
                       ["r2f0", "abc",  "ÀBCßßZ", "ghi"],
                       ["r3f0", "123",  "456",    "789"]];

    size_t[] fields_2_0 = [2, 0];

    // The expected states of the output field while each line and field are processed.
    auto expectedBylineByfield_2_0 =
        [
            [["", "r1f0"], ["", "r1f0"], ["r1f2", "r1f0"],   ["r1f2", "r1f0"]],
            [["", "r2f0"], ["", "r2f0"], ["ÀBCßßZ", "r2f0"], ["ÀBCßßZ", "r2f0"]],
            [["", "r3f0"], ["", "r3f0"], ["456", "r3f0"],    ["456", "r3f0"]],
        ];

    char[][][][] charExpectedBylineByfield_2_0 = to!(char[][][][])(expectedBylineByfield_2_0);

    auto charIFR = new InputFieldReordering!(char, EnablePartialLines.yes)(fields_2_0);

    foreach (lineIndex, line; inputLines)
    {
        charIFR.initNewLine;
        foreach (fieldIndex, fieldValue; line)
        {
            charIFR.processNextField(fieldIndex, to!(char[])(fieldValue));
            assert(charIFR.outputFields == charExpectedBylineByfield_2_0[lineIndex][fieldIndex]);
        }
    }
}

// InputFieldReordering - Field combination tests.
@safe unittest
{
    import std.conv : to;
    import std.stdio;

    auto inputLines = [["00", "01", "02", "03"],
                       ["10", "11", "12", "13"],
                       ["20", "21", "22", "23"]];

    /* A field selection together with the output expected for each input line. */
    static struct Combination
    {
        size_t[] fields;
        string[][] expected;
    }

    auto combinations =
        [
            Combination([0],
                        [["00"],
                         ["10"],
                         ["20"]]),

            Combination([3],
                        [["03"],
                         ["13"],
                         ["23"]]),

            Combination([0, 1],
                        [["00", "01"],
                         ["10", "11"],
                         ["20", "21"]]),

            Combination([1, 0],
                        [["01", "00"],
                         ["11", "10"],
                         ["21", "20"]]),

            Combination([0, 3],
                        [["00", "03"],
                         ["10", "13"],
                         ["20", "23"]]),

            Combination([3, 0],
                        [["03", "00"],
                         ["13", "10"],
                         ["23", "20"]]),

            Combination([0, 1, 2, 3],
                        [["00", "01", "02", "03"],
                         ["10", "11", "12", "13"],
                         ["20", "21", "22", "23"]]),

            Combination([3, 2, 1, 0],
                        [["03", "02", "01", "00"],
                         ["13", "12", "11", "10"],
                         ["23", "22", "21", "20"]]),

            Combination([0, 3, 0, 0, 1],
                        [["00", "03", "00", "00", "01"],
                         ["10", "13", "10", "10", "11"],
                         ["20", "23", "20", "20", "21"]]),
        ];

    foreach (ref combo; combinations)
    {
        auto ifr = new InputFieldReordering!char(combo.fields);

        foreach (lineIndex, line; inputLines)
        {
            ifr.initNewLine;

            /* Every field is passed in, there is no early exit on allFieldsFilled. */
            foreach (fieldIndex, fieldValue; line)
            {
                ifr.processNextField(fieldIndex, to!(char[])(fieldValue));
            }

            assert(ifr.outputFields == to!(char[][])(combo.expected[lineIndex]));
        }
    }
}

/** Flag accepted by input buffering ranges to indicate if data should be read using
    line buffering. Input is read as soon as lines are available when line buffered mode
    is used.
*/
alias LineBuffered = Flag!"lineBuffered";

/** Flag accepted by input buffering ranges to indicate if the header line should be
    read when opening a file.
*/
alias ReadHeader = Flag!"readHeader";

/**
BufferedOutputRangeDefaults defines the parameter defaults used by
BufferedOutputRange. These can be passed to the BufferedOutputRange
constructor when mixing specific settings with defaults.
*/
enum BufferedOutputRangeDefaults
{
    flushSize = 10 * 1024,              // 10240
    lineBufferedFlushSize = 1,
    reserveSize = 11 * 1024,            // 11264
    maxSize = 4 * 1024 * 1024           // 4194304
}

/**
BufferedOutputRange is a performance enhancement over writing directly to an output
stream. It holds a File open for write or an OutputRange. Output is accumulated in an
internal buffer and written to the output stream as a block.

Writing to stdout is a key use case. BufferedOutputRange is often dramatically faster
than writing to stdout directly. This is especially noticeable for outputs with short
lines, as it blocks many writes together in a single write.

The internal buffer is written to the output stream after flushSize has been reached.
This is checked at newline boundaries, when appendln is called or when put is called
with a single newline character. Other writes check maxSize, which is used to avoid
runaway buffers.

This scheme only flushes the internal buffer, it does not flush the output stream.
Use flush() to flush both the internal buffer and the output stream. Specify flushSize
as BufferedOutputRangeDefaults.lineBufferedFlushSize in the constructor to get line
buffering with immediate flushes to the output stream.

The output stream type must be provided as a template argument during construction. E.g.
```
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout)
```

BufferedOutputRange has a put method allowing it to be used as an output range. It has a
number of other methods providing additional control.

Methods:

$(LIST
    * `this(outputStream [, flushSize, reserveSize, maxSize])` - Constructor.
Takes the
      output stream, e.g. stdout. Other arguments are optional, defaults normally suffice.

    * `this(outputStream, LineBuffered)` - Alternate constructor for turning line-buffered
      mode on.

    * `append(stuff)` - Append to the internal buffer. The buffer is written to the
      output stream if it has reached flushSize and ends in a newline, or if it has
      reached maxSize.

    * `appendln(stuff)` - Append to the internal buffer, followed by a newline. The buffer
      is flushed to the output stream if it has reached flushSize.

    * `appendln()` - Append a newline to the internal buffer. The buffer is flushed to the
      output stream if it has reached flushSize.

    * `joinAppend(inputRange, delim)` - An optimization of `append(inputRange.joiner(delim))`.
      For reasons that are not clear, joiner is quite slow.

    * `flush()` - Writes the internal buffer to the output stream and flushes the output
      stream.

    * `put(stuff)` - Appends to the internal buffer. Acts as `appendln()` if passed a single
      newline character, '\n' or "\n".

    * `flushBuffer()` - (private) Writes the internal buffer to the output stream. The
      output stream itself is flushed only in line-buffered mode.
)

The internal buffer is automatically flushed when the BufferedOutputRange goes out of
scope.
*/
struct BufferedOutputRange(OutputTarget)
if (isFileHandle!(Unqual!OutputTarget) || isOutputRange!(Unqual!OutputTarget, char))
{
    import std.array : appender;
    import std.format : format;

    /* Identify the output element type. Only supporting char and ubyte for now. */
    static if (isFileHandle!OutputTarget || isOutputRange!(OutputTarget, char))
    {
        alias C = char;
    }
    else static if (isOutputRange!(OutputTarget, ubyte))
    {
        alias C = ubyte;
    }
    else static assert(false);

    private OutputTarget _outputTarget;
    private auto _outputBuffer = appender!(C[]);
    private immutable size_t _flushSize;
    private immutable size_t _maxSize;

    /** Constructor. Takes the output stream, e.g. stdout. Optional arguments control
     *  buffering behavior, defaults normally suffice. The defaults are available from
     *  the `BufferedOutputRangeDefaults` enum.
     *
     *  `flushSize` is expected to be no larger than `maxSize`. This is asserted. When
     *  assertions are compiled out, `maxSize` is raised to `flushSize` instead.
     */
    this(OutputTarget outputTarget,
         size_t flushSize = BufferedOutputRangeDefaults.flushSize,
         size_t reserveSize = BufferedOutputRangeDefaults.reserveSize,
         size_t maxSize = BufferedOutputRangeDefaults.maxSize)
    {
        assert(flushSize <= maxSize);

        _outputTarget = outputTarget;
        _flushSize = flushSize;
        _maxSize = (flushSize <= maxSize) ? maxSize : flushSize;
        _outputBuffer.reserve(reserveSize);
    }

    /** Alternate constructor used to turn line-buffered mode on. Use Yes.lineBuffered
     *  to enable. Lines are flushed at newline boundaries when in line-buffered mode.
     */
    this(OutputTarget outputTarget, LineBuffered lineBuffered)
    {
        immutable size_t flushSize = lineBuffered ?
            BufferedOutputRangeDefaults.lineBufferedFlushSize :
            BufferedOutputRangeDefaults.flushSize;

        this(outputTarget, flushSize);
    }

    /* Remaining buffered data is written, and the output stream flushed, on destruction. */
    ~this()
    {
        flush();
    }

    /* flushBuffer writes the internal buffer to the output target and clears the buffer.
     * A File target is also flushed, but only when the flush size equals the
     * line-buffered flush size, i.e. in line-buffered mode.
     */
    private void flushBuffer()
    {
        static if (isFileHandle!OutputTarget)
        {
            _outputTarget.rawWrite(_outputBuffer.data);

            if (_flushSize == BufferedOutputRangeDefaults.lineBufferedFlushSize)
            {
                _outputTarget.flush();
            }
        }
        else _outputTarget.put(_outputBuffer.data);

        _outputBuffer.clear;
    }

    /** Writes the internal buffer to the output stream and flushes the output stream.
     */
    void flush()
    {
        flushBuffer();
        static if (isFileHandle!OutputTarget) _outputTarget.flush();
    }

    /* flushIfFull flushes the internal buffer if flushSize has been reached.
     * Returns true if the buffer was flushed.
     */
    private bool flushIfFull()
    {
        bool isFull = _outputBuffer.data.length >= _flushSize;
        if (isFull) flushBuffer();
        return isFull;
    }

    /* flushIfMaxSize is a safety check to avoid runaway buffer growth. */
    private void flushIfMaxSize()
    {
        if (_outputBuffer.data.length >= _maxSize) flushBuffer();
    }

    /* maybeFlush is intended for the case where put is called with a trailing newline.
     *
     * Flushing occurs if the buffer has a trailing newline and has reached flush size.
     * Flushing also occurs if the buffer has reached max size. An empty buffer is never
     * flushed. Returns true if the buffer was flushed.
     *
     * Like flushIfFull and flushIfMaxSize, this writes the internal buffer via
     * flushBuffer. It does not force a flush of the output stream, that is left to
     * flush() and to line-buffered mode.
     */
    private bool maybeFlush()
    {
        immutable size_t bufferedLength = _outputBuffer.data.length;

        /* The emptiness test must come before the `$-1` index. With a flushSize of zero
         * an empty buffer passes the size test and would be indexed out of bounds.
         */
        immutable bool doFlush =
            bufferedLength > 0 &&
            bufferedLength >= _flushSize &&
            (_outputBuffer.data[$-1] == '\n' || bufferedLength >= _maxSize);

        if (doFlush) flushBuffer();
        return doFlush;
    }

    /** Appends data to the output buffer without checking for flush conditions. This
     *  is intended for cases where an `appendln` or `append` ending in newline will
     *  shortly follow.
     */
    private void appendRaw(T)(T stuff) pure
    {
        import std.range : rangePut = put;
        rangePut(_outputBuffer, stuff);
    }

    /** Appends data to the output buffer. The output buffer is flushed if the appended
     *  data ends in a newline and the output buffer has reached `flushSize`. It is also
     *  flushed if it has reached `maxSize`.
     */
    void append(T...)(T stuff)
    {
        foreach (x; stuff) appendRaw(x);
        maybeFlush();
    }

    /** Appends data plus a newline to the output buffer. The output buffer is flushed
     *  if it has reached `flushSize`. Returns true if the buffer was flushed.
     */
    bool appendln(T...)(T stuff)
    {
        foreach (x; stuff) appendRaw(x);
        appendRaw('\n');
        return flushIfFull();
    }

    /** joinAppend is an optimization of append(inputRange.joiner(delimiter)).
     *  This form is quite a bit faster, 40%+ on some benchmarks.
     *
     *  The buffer is flushed afterwards only if it has reached `maxSize`.
     */
    void joinAppend(InputRange, E)(InputRange inputRange, E delimiter)
    if (isInputRange!InputRange &&
        is(ElementType!InputRange : const C[]) &&
        (is(E : const C[]) || is(E : const C)))
    {
        if (!inputRange.empty)
        {
            appendRaw(inputRange.front);
            inputRange.popFront;
        }
        foreach (x; inputRange)
        {
            appendRaw(delimiter);
            appendRaw(x);
        }
        flushIfMaxSize();
    }

    /** The `put` method makes BufferedOutputRange an OutputRange. It operates similarly
     *  to `append`. A single newline, '\n' or "\n", is handled as `appendln()`. Any other
     *  single character is appended without a flush check.
     */
    void put(T)(T stuff)
    {
        import std.traits;
        import std.stdio;

        static if (isSomeChar!T)
        {
            if (stuff == '\n') appendln();
            else appendRaw(stuff);
        }
        else static if (isSomeString!T)
        {
            if (stuff == "\n") appendln();
            else append(stuff);
        }
        else append(stuff);
    }
}

// BufferedOutputRange.
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_output");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : map, joiner;
    import std.range : iota;
    import std.conv : to;

    /* Basic test. Note that exiting the scope triggers flush.
     */
    string filepath1 = buildPath(testDir, "file1.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath1.File("wb"));
        ostream.append("file1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln(100.to!string);
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
        ostream.appendln('A');
        ostream.appendln("B", "CD");
        ostream.appendln('E', "FG", 'H');
        ostream.appendln('I', "JK", 'L', "M");
        ostream.append('N', "O");
        ostream.append('P', "QR", "STU\n");
    }
    assert(filepath1.readText == "file1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\nA\nBCD\nEFGH\nIJKLM\nNOPQRSTU\n");

    /* Test with no reserve and flush at every line. */
    string filepath2 = buildPath(testDir, "file2.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(filepath2.File("wb"), 0, 0);
        ostream.append("file2: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
        ostream.appendln("X");
    }
    assert(filepath2.readText == "file2: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\nX\n");

    /* Test default line-buffered mode (flush at every line). */
    string filepath2a = buildPath(testDir, "file2a.txt");
    {
        import std.stdio : File;

        auto ostream = BufferedOutputRange!File(
            filepath2a.File("wb"), BufferedOutputRangeDefaults.lineBufferedFlushSize);
        ostream.append("file2a: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
        ostream.appendln("X");
    }
    assert(filepath2a.readText == "file2a: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\nX\n");

    /* Test the alternate constructor. */
    static foreach (isLineBuffered; [Yes.lineBuffered, No.lineBuffered])
    {{
        string filepath2b = buildPath(testDir, "file2b.txt");
        {
            import std.stdio : File;

            auto ostream = BufferedOutputRange!File(filepath2b.File("wb"), isLineBuffered);
            ostream.append("file2b: ");
            ostream.append("abc");
            ostream.append(["def", "ghi", "jkl"]);
            ostream.appendln("100");
            ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
            ostream.appendln();
            ostream.appendln("X");
        }
        assert(filepath2b.readText == "file2b: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\nX\n");
    }}

    /* With a locking text writer. Requires version 2.078.0
       See: https://issues.dlang.org/show_bug.cgi?id=9661
     */
    static if (__VERSION__ >= 2078)
    {
        string filepath3 = buildPath(testDir, "file3.txt");
        {
            import std.stdio : File;

            auto ltw = filepath3.File("wb").lockingTextWriter;
            {
                auto ostream = BufferedOutputRange!(typeof(ltw))(ltw);
                ostream.append("file3: ");
                ostream.append("abc");
                ostream.append(["def", "ghi", "jkl"]);
                ostream.appendln("100");
                ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
                ostream.appendln();
            }
        }
        assert(filepath3.readText == "file3: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");
    }

    /* With an Appender. */
    import std.array : appender;
    auto app1 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1))(app1);
        ostream.append("appender1: ");
        ostream.append("abc");
        ostream.append(["def", "ghi", "jkl"]);
        ostream.appendln("100");
        ostream.append(iota(0, 10).map!(x => x.to!string).joiner(" "));
        ostream.appendln();
    }
    assert(app1.data == "appender1: abcdefghijkl100\n0 1 2 3 4 5 6 7 8 9\n");

    /* With an Appender, but checking flush boundaries. */
    auto app2 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app2))(app2, 10, 0); // Flush if 10+
        bool wasFlushed = false;

        assert(app2.data == "");

        ostream.append("12345678"); // Not flushed yet.
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Ninth char, not flushed yet.
        assert(!wasFlushed);
        assert(app2.data == "");

        wasFlushed = ostream.appendln;  // Tenth char, now flushed.
        assert(wasFlushed);
        assert(app2.data == "12345678\n\n");

        app2.clear;
        assert(app2.data == "");

        ostream.append("12345678");

        wasFlushed = ostream.flushIfFull;
        assert(!wasFlushed);
        assert(app2.data == "");

        ostream.flush;
        assert(app2.data == "12345678");

        app2.clear;
        assert(app2.data == "");

        ostream.append("123456789012345");
        assert(app2.data == "");
    }
    assert(app2.data == "123456789012345");

    /* Using joinAppend. */
    auto app1b = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app1b))(app1b);
        ostream.append("appenderB: ");
        ostream.joinAppend(["a", "bc", "def"], '-');
        ostream.append(':');
        ostream.joinAppend(["g", "hi", "jkl"], '-');
        ostream.appendln("*100*");
        ostream.joinAppend(iota(0, 6).map!(x => x.to!string), ' ');
        ostream.append(' ');
        ostream.joinAppend(iota(6, 10).map!(x => x.to!string), " ");
        ostream.appendln();
    }
    assert(app1b.data == "appenderB: a-bc-def:g-hi-jkl*100*\n0 1 2 3 4 5 6 7 8 9\n",
           "app1b.data: |" ~app1b.data ~ "|");

    /* Operating as an output range. When passed to a function as a ref, exiting
     * the function does not flush. When passed as a value, it gets flushed when
     * the function returns. Also test both UFCS and non-UFCS styles.
     */

    void outputStuffAsRef(T)(ref T range)
    if (isOutputRange!(T, char))
    {
        range.put('1');
        put(range, "23");
        range.put('\n');
        range.put(["5", "67"]);
        put(range, iota(8, 10).map!(x => x.to!string));
        put(range, "\n");
    }

    void outputStuffAsVal(T)(T range)
    if (isOutputRange!(T, char))
    {
        put(range, '1');
        range.put("23");
        put(range, '\n');
        put(range, ["5", "67"]);
        range.put(iota(8, 10).map!(x => x.to!string));
        range.put("\n");
    }

    auto app3 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app3))(app3, 12, 0);
        outputStuffAsRef(ostream);
        assert(app3.data == "", "app3.data: |" ~app3.data ~ "|");
        outputStuffAsRef(ostream);
        assert(app3.data == "123\n56789\n123\n", "app3.data: |" ~app3.data ~ "|");
    }
    assert(app3.data == "123\n56789\n123\n56789\n", "app3.data: |" ~app3.data ~ "|");

    auto app4 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app4))(app4, 12, 0);
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n", "app4.data: |" ~app4.data ~ "|");
        outputStuffAsVal(ostream);
        assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");
    }
    assert(app4.data == "123\n56789\n123\n56789\n", "app4.data: |" ~app4.data ~ "|");

    /* Test maxSize. */
    auto app5 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app5))(app5, 5, 0, 10); // maxSize 10
        assert(app5.data == "");

        ostream.append("1234567");  // Not flushed yet (no newline).
        assert(app5.data == "");

        ostream.append("89012");    // Flushed by maxSize
        assert(app5.data == "123456789012");

        ostream.put("1234567");     // Not flushed yet (no newline).
        assert(app5.data == "123456789012");

        ostream.put("89012");       // Flushed by maxSize
        assert(app5.data == "123456789012123456789012");

        ostream.joinAppend(["ab", "cd"], '-');        // Not flushed yet
        ostream.joinAppend(["de", "gh", "ij"], '-');  // Flushed by maxSize
        assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");
    }
    assert(app5.data == "123456789012123456789012ab-cdde-gh-ij");

    /* Regression: an empty append on an empty buffer with a zero flushSize must not
     * index past the start of the buffer.
     */
    auto app6 = appender!(char[]);
    {
        auto ostream = BufferedOutputRange!(typeof(app6))(app6, 0, 0);
        ostream.append("");
        assert(app6.data == "");

        ostream.append("x\n");      // Trailing newline, flushSize reached.
        assert(app6.data == "x\n");
    }
    assert(app6.data == "x\n");
}

/**
isFlushableOutputRange returns true if R is an output range with a flush member.
*/
enum bool isFlushableOutputRange(R, E=char) = isOutputRange!(R, E)
    && is(ReturnType!((R r) => r.flush) == void);

@safe unittest
{
    import std.array;
    auto app = appender!(char[]);
    auto ostream = BufferedOutputRange!(typeof(app))(app, 5, 0, 10); // maxSize 10

    static assert(isOutputRange!(typeof(app), char));
    static assert(!isFlushableOutputRange!(typeof(app), char));
    static assert(!isFlushableOutputRange!(typeof(app)));

    static assert(isOutputRange!(typeof(ostream), char));
    static assert(isFlushableOutputRange!(typeof(ostream), char));
    static assert(isFlushableOutputRange!(typeof(ostream)));

    static assert(isOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string, string));
    static assert(!isFlushableOutputRange!(Appender!string));

    static assert(isOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[]), char));
    static assert(!isFlushableOutputRange!(Appender!(char[])));

    static assert(isOutputRange!(BufferedOutputRange!(Appender!(char[])), char));
    static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[]))));
    static assert(isFlushableOutputRange!(BufferedOutputRange!(Appender!(char[])), char));
}

/**
bufferedByLine is a performance enhancement over std.stdio.File.byLine.
It works by
reading a large buffer from the input stream rather than just a single line.

The file argument needs to be a File object open for reading, typically a filesystem
file or standard input. Use the Yes.keepTerminator template parameter to keep the
newline. This is similar to stdio.File.byLine, except specified as a template parameter
rather than a runtime parameter.

Reading in blocks does mean that input is not read until a full buffer is available or
end-of-file is reached. Reading each line as it is available can be enabled by setting
the lineBuffered parameter to Yes.lineBuffered. In this case bufferedByLine behaves
like stdio.File.byLine.

As a separate option, the first line of the file can be read as soon as it is available,
without waiting for a complete buffer. This is useful for reading the header line before
the rest of the data is available. Set the readHeader parameter to Yes.readHeader to get
this behavior.

Template parameters:
$(LIST
    * keepTerminator - Yes.keepTerminator leaves the terminator on each returned line.
    * Char - Element type of the returned lines; must be char or ubyte.
    * terminator - The byte that ends a line.
    * readSize - Number of bytes requested from the file on each block read.
    * growSize - Increment used when the internal buffer must be enlarged. Must be
      greater than zero and no larger than readSize.
)

Params:
    file = File to read. Must be open (checked with an assert).
    lineBuffered = Yes.lineBuffered reads every line with File.readln.
    readHeader = Yes.readHeader reads only the first line with File.readln.

Returns: A class instance providing the input range primitives empty, front and
popFront. The slice returned by front refers to the internal buffer, which is
overwritten by later calls to popFront.
*/

auto bufferedByLine(KeepTerminator keepTerminator = No.keepTerminator, Char = char,
                    ubyte terminator = '\n', size_t readSize = 1024 * 128, size_t growSize = 1024 * 16)
    (File file, LineBuffered lineBuffered = No.lineBuffered, ReadHeader readHeader = No.readHeader)
if (is(Char == char) || is(Char == ubyte))
{
    /* The growth loop in popFrontFullBuffered adds growSize repeatedly, so it must
     * be non-zero.
     */
    static assert(0 < growSize && growSize <= readSize);

    static final class BufferedByLineImpl
    {
        /* Buffer state variables
         *   - _buffer.length - Full length of allocated buffer.
         *   - _dataEnd - End of currently valid data (end of last read).
         *   - _lineStart - Start of current line.
         *   - _lineEnd - End of current line.
         */
        private File _file;
        private immutable LineBuffered _lineBuffered;
        private ubyte[] _buffer;
        private size_t _lineStart = 0;
        private size_t _lineEnd = 0;
        private size_t _dataEnd = 0;

        /* Allocates the buffer and, unless the file is already at end-of-file,
         * reads the first line so that front is valid immediately.
         */
        this (File f, LineBuffered lineBuffered, ReadHeader readHeader)
        {
            _file = f;
            _lineBuffered = lineBuffered;
            _buffer = new ubyte[readSize + growSize];

            if (!_file.eof)
            {
                /* With readHeader the first line is always fetched via readln,
                 * regardless of the lineBuffered setting.
                 */
                if (readHeader) popFrontLineBuffered();
                else popFront();
            }
        }

        /* True when the file is at end-of-file and the buffer holds no unconsumed data. */
        bool empty() const pure
        {
            return _file.eof && _lineStart == _dataEnd;
        }

        /* Returns the current line as a slice of the internal buffer. */
        Char[] front() pure
        {
            assert(!empty, "Attempt to take the front of an empty bufferedByLine.");

            static if (keepTerminator == Yes.keepTerminator)
            {
                return cast(Char[]) _buffer[_lineStart .. _lineEnd];
            }
            else
            {
                assert(_lineStart < _lineEnd);
                /* The last line of a file may have no terminator; strip only if present. */
                immutable end = (_buffer[_lineEnd - 1] == terminator) ? _lineEnd - 1 : _lineEnd;
                return cast(Char[]) _buffer[_lineStart .. end];
            }
        }

        /* Advances to the next line using the strategy selected at construction. */
        void popFront()
        {
            assert(!empty, "Attempt to popFront an empty bufferedByLine.");

            if (!_lineBuffered) popFrontFullBuffered();
            else popFrontLineBuffered();
        }

        /* Discards the current line and reads the next line with File.readln.
         * Intended for use when reading in line-buffered mode. However, it is
         * also used to read in the header line when in full-buffered mode.
         */
        private void popFrontLineBuffered()
        {
            assert(_lineEnd == _dataEnd);
            assert(!empty, "Attempt to popFront (LineBuffered) an empty bufferedByLine.");

            /* readln receives a char[] view of the whole buffer and returns the
             * number of characters read.
             */
            char[] line = cast(char[]) _buffer;
            _lineStart = 0;
            _lineEnd = _dataEnd = _file.readln(line);

            /* If 'line' came back longer than the buffer, adopt it as the new buffer. */
            if (line.length > _buffer.length) _buffer = cast(ubyte[]) line;

            assert(_lineEnd == line.length);
            assert(_dataEnd == line.length);
        }

        /* Advances to the next line in block-read mode, reading more data from the
         * file only when the buffer holds no complete line.
         */
        private void popFrontFullBuffered()
        {
            import std.algorithm: copy, find;

            assert(!empty, "Attempt to popFront (Full Buffered) an empty bufferedByLine.");

            /* Pop the current line. */
            _lineStart = _lineEnd;

            /* Set up the next line if more data is available, either in the buffer or
             * the file. The next line ends at the next newline, if there is one.
             *
             * Notes:
             * - 'find' returns the slice starting with the character searched for, or
             *   an empty range if not found.
             * - _lineEnd is set to _dataEnd both when the current buffer does not have
             *   a newline and when it ends with one.
             * - 'found' runs from the terminator to _dataEnd, so
             *   '_dataEnd - found.length' is the terminator's index and adding one
             *   places _lineEnd just past it.
             */
            auto found = _buffer[_lineStart .. _dataEnd].find(terminator);
            _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

            if (found.empty && !_file.eof)
            {
                /* No newline in current buffer. Read from the file until the next
                 * newline is found.
                 */
                assert(_lineEnd == _dataEnd);

                if (_lineStart > 0)
                {
                    /* Move remaining data to the start of the buffer. */
                    immutable remainingLength = _dataEnd - _lineStart;
                    copy(_buffer[_lineStart .. _dataEnd], _buffer[0 .. remainingLength]);
                    _lineStart = 0;
                    _lineEnd = _dataEnd = remainingLength;
                }

                do
                {
                    /* Grow the buffer if necessary. Growth is in multiples of
                     * growSize until at least readSize bytes are free after _dataEnd.
                     */
                    immutable availableSize = _buffer.length - _dataEnd;
                    if (availableSize < readSize)
                    {
                        size_t growBy = growSize;
                        while (availableSize + growBy < readSize) growBy += growSize;
                        _buffer.length += growBy;
                    }

                    /* Read the next block. */
                    _dataEnd +=
                        _file.rawRead(_buffer[_dataEnd .. _dataEnd + readSize])
                        .length;

                    /* Search only the newly read bytes; _lineEnd still marks the
                     * previous end of data.
                     */
                    found = _buffer[_lineEnd .. _dataEnd].find(terminator);
                    _lineEnd = found.empty ? _dataEnd : _dataEnd - found.length + 1;

                } while (found.empty && !_file.eof);
            }
        }
    }

    assert(file.isOpen, "bufferedByLine passed a closed file.");

    return new BufferedByLineImpl(file, lineBuffered, readHeader);
}

// BufferedByLine.
unittest
{
    import std.array : appender;
    import std.conv : to;
    import std.file : rmdirRecurse, readText;
    import std.path : buildPath;
    import std.range : lockstep;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_buffered_byline");
    scope(exit) testDir.rmdirRecurse;

    /* Create five data files with the same data. Read each in parallel with byLine and
     * bufferedByLine and compare each line. bufferedByLine is run in both full buffered
     * and line buffered modes.
     */
    auto data1 = appender!(char[])();

    foreach (i; 1 .. 1001) data1.put('\n');
    foreach (i; 1 .. 1001) data1.put("a\n");
    foreach (i; 1 .. 1001) { data1.put(i.to!string); data1.put('\n'); }
    foreach (i; 1 .. 1001)
    {
        foreach (j; 1 .. 
i+1) data1.put('x'); 1137 data1.put('\n'); 1138 } 1139 1140 string file1a = buildPath(testDir, "file1a.txt"); 1141 string file1b = buildPath(testDir, "file1b.txt"); 1142 string file1c = buildPath(testDir, "file1c.txt"); 1143 string file1d = buildPath(testDir, "file1d.txt"); 1144 string file1e = buildPath(testDir, "file1e.txt"); 1145 1146 foreach (f; [file1a, file1b, file1c, file1d, file1e]) 1147 { 1148 auto fh = f.File("wb"); 1149 fh.write(data1.data); 1150 fh.close; 1151 } 1152 1153 /* Default buffer sizes */ 1154 static foreach (keepTerm; [No.keepTerminator, Yes.keepTerminator]) 1155 {{ 1156 auto f1aFH = file1a.File(); 1157 auto f1bFH = file1b.File(); 1158 auto f1cFH = file1c.File(); 1159 auto f1dFH = file1d.File(); 1160 auto f1eFH = file1e.File(); 1161 1162 auto f1aIn = f1aFH.byLine(keepTerm); 1163 auto f1bIn = f1bFH.bufferedByLine!(keepTerm); 1164 auto f1cIn = f1cFH.bufferedByLine!(keepTerm)(Yes.lineBuffered); 1165 auto f1dIn = f1dFH.bufferedByLine!(keepTerm)(No.lineBuffered, Yes.readHeader); 1166 auto f1eIn = f1eFH.bufferedByLine!(keepTerm)(Yes.lineBuffered, Yes.readHeader); 1167 1168 foreach (a, b, c, d, e; lockstep(f1aIn, f1bIn, f1cIn, f1dIn, f1eIn, StoppingPolicy.requireSameLength)) 1169 { 1170 assert(a == b); 1171 assert(a == c); 1172 assert(a == d); 1173 assert(a == e); 1174 } 1175 1176 f1aFH.close; 1177 f1bFH.close; 1178 f1cFH.close; 1179 f1dFH.close; 1180 f1eFH.close; 1181 }} 1182 1183 /* Smaller read size. This will trigger buffer growth. 
*/ 1184 static foreach (keepTerm; [No.keepTerminator, Yes.keepTerminator]) 1185 {{ 1186 auto f1aFH = file1a.File(); 1187 auto f1bFH = file1b.File(); 1188 auto f1cFH = file1c.File(); 1189 auto f1dFH = file1d.File(); 1190 auto f1eFH = file1e.File(); 1191 1192 auto f1aIn = f1aFH.byLine(keepTerm); 1193 auto f1bIn = f1bFH.bufferedByLine!(keepTerm, char, '\n', 512, 256); 1194 auto f1cIn = f1cFH.bufferedByLine!(keepTerm, char, '\n', 512, 256)(Yes.lineBuffered); 1195 auto f1dIn = f1dFH.bufferedByLine!(keepTerm, char, '\n', 512, 256)(No.lineBuffered, Yes.readHeader); 1196 auto f1eIn = f1eFH.bufferedByLine!(keepTerm, char, '\n', 512, 256)(Yes.lineBuffered, Yes.readHeader); 1197 1198 foreach (a, b, c, d, e; lockstep(f1aIn, f1bIn, f1cIn, f1dIn, f1eIn, StoppingPolicy.requireSameLength)) 1199 { 1200 assert(a == b); 1201 assert(a == c); 1202 assert(a == d); 1203 assert(a == e); 1204 } 1205 1206 f1aFH.close; 1207 f1bFH.close; 1208 f1cFH.close; 1209 f1dFH.close; 1210 f1eFH.close; 1211 }} 1212 1213 /* Exercise boundary cases in buffer growth. */ 1214 static foreach (keepTerm; [No.keepTerminator, Yes.keepTerminator]) 1215 { 1216 static foreach (readSize; [1, 2, 4]) 1217 { 1218 static foreach (growSize; 1 .. 
readSize + 1) 1219 {{ 1220 auto f1aFH = file1a.File(); 1221 auto f1bFH = file1b.File(); 1222 auto f1cFH = file1c.File(); 1223 auto f1dFH = file1d.File(); 1224 auto f1eFH = file1e.File(); 1225 1226 auto f1aIn = f1aFH.byLine(keepTerm); 1227 auto f1bIn = f1bFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize); 1228 auto f1cIn = f1cFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered); 1229 auto f1dIn = f1dFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered, Yes.readHeader); 1230 auto f1eIn = f1eFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered, Yes.readHeader); 1231 1232 foreach (a, b, c, d, e; lockstep(f1aIn, f1bIn, f1cIn, f1dIn, f1eIn, StoppingPolicy.requireSameLength)) 1233 { 1234 assert(a == b); 1235 assert(a == c); 1236 assert(a == d); 1237 assert(a == e); 1238 } 1239 1240 f1aFH.close; 1241 f1bFH.close; 1242 f1cFH.close; 1243 f1dFH.close; 1244 f1eFH.close; 1245 }} 1246 } 1247 } 1248 1249 /* Files that do not end in a newline. 
*/ 1250 1251 string file2a = buildPath(testDir, "file2a.txt"); 1252 string file2b = buildPath(testDir, "file2b.txt"); 1253 string file2c = buildPath(testDir, "file2c.txt"); 1254 string file2d = buildPath(testDir, "file2d.txt"); 1255 string file2e = buildPath(testDir, "file2e.txt"); 1256 string file3a = buildPath(testDir, "file3a.txt"); 1257 string file3b = buildPath(testDir, "file3b.txt"); 1258 string file3c = buildPath(testDir, "file3c.txt"); 1259 string file3d = buildPath(testDir, "file3d.txt"); 1260 string file3e = buildPath(testDir, "file3e.txt"); 1261 1262 foreach (f; [file1a, file1b, file1c, file1d, file1e]) 1263 { 1264 auto fh = f.File("wb"); 1265 fh.write("a"); 1266 fh.close; 1267 } 1268 1269 foreach (f; [file2a, file2b, file2c, file2d, file2e]) 1270 { 1271 auto fh = f.File("wb"); 1272 fh.write("ab"); 1273 fh.close; 1274 } 1275 1276 foreach (f; [file3a, file3b, file3c, file3d, file3e]) 1277 { 1278 auto fh = f.File("wb"); 1279 fh.write("abc"); 1280 fh.close; 1281 } 1282 1283 static foreach (keepTerm; [No.keepTerminator, Yes.keepTerminator]) 1284 { 1285 static foreach (readSize; [1, 2, 4]) 1286 { 1287 static foreach (growSize; 1 .. 
readSize + 1) 1288 {{ 1289 auto f1aFH = file1a.File(); 1290 auto f1bFH = file1b.File(); 1291 auto f1cFH = file1c.File(); 1292 auto f1dFH = file1d.File(); 1293 auto f1eFH = file1e.File(); 1294 1295 auto f1aIn = f1aFH.byLine(keepTerm); 1296 auto f1bIn = f1bFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered); 1297 auto f1cIn = f1cFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered); 1298 auto f1dIn = f1dFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered, Yes.readHeader); 1299 auto f1eIn = f1eFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered, Yes.readHeader); 1300 1301 foreach (a, b, c, d, e; lockstep(f1aIn, f1bIn, f1cIn, f1dIn, f1eIn, StoppingPolicy.requireSameLength)) 1302 { 1303 assert(a == b); 1304 assert(a == c); 1305 assert(a == d); 1306 assert(a == e); 1307 } 1308 1309 f1aFH.close; 1310 f1bFH.close; 1311 f1cFH.close; 1312 f1dFH.close; 1313 f1eFH.close; 1314 1315 auto f2aFH = file2a.File(); 1316 auto f2bFH = file2b.File(); 1317 auto f2cFH = file2c.File(); 1318 auto f2dFH = file2d.File(); 1319 auto f2eFH = file2e.File(); 1320 1321 auto f2aIn = f2aFH.byLine(keepTerm); 1322 auto f2bIn = f2bFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered); 1323 auto f2cIn = f2cFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered); 1324 auto f2dIn = f2dFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered, Yes.readHeader); 1325 auto f2eIn = f2eFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered, Yes.readHeader); 1326 1327 foreach (a, b, c, d, e; lockstep(f2aIn, f2bIn, f2cIn, f2dIn, f2eIn, StoppingPolicy.requireSameLength)) 1328 { 1329 assert(a == b); 1330 assert(a == c); 1331 assert(a == d); 1332 assert(a == e); 1333 } 1334 1335 f2aFH.close; 1336 f2bFH.close; 1337 f2cFH.close; 1338 f2dFH.close; 1339 f2eFH.close; 1340 1341 auto f3aFH = file3a.File(); 1342 auto f3bFH = 
file3b.File(); 1343 auto f3cFH = file3c.File(); 1344 auto f3dFH = file3d.File(); 1345 auto f3eFH = file3e.File(); 1346 1347 auto f3aIn = f3aFH.byLine(keepTerm); 1348 auto f3bIn = f3bFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered); 1349 auto f3cIn = f3cFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered); 1350 auto f3dIn = f3dFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(No.lineBuffered, Yes.readHeader); 1351 auto f3eIn = f3eFH.bufferedByLine!(keepTerm, char, '\n', readSize, growSize)(Yes.lineBuffered, Yes.readHeader); 1352 1353 foreach (a, b, c, d, e; lockstep(f3aIn, f3bIn, f3cIn, f3dIn, f3eIn, StoppingPolicy.requireSameLength)) 1354 { 1355 assert(a == b); 1356 assert(a == c); 1357 assert(a == d); 1358 assert(a == e); 1359 } 1360 1361 f3aFH.close; 1362 f3bFH.close; 1363 f3cFH.close; 1364 f3dFH.close; 1365 f3eFH.close; 1366 }} 1367 } 1368 } 1369 } 1370 1371 /** 1372 joinAppend performs a join operation on an input range, appending the results to 1373 an output range. 1374 1375 joinAppend was written as a performance enhancement over using std.algorithm.joiner 1376 or std.array.join with writeln. Using joiner with writeln is quite slow, 3-4x slower 1377 than std.array.join with writeln. The joiner performance may be due to interaction 1378 with writeln, this was not investigated. Using joiner with stdout.lockingTextWriter 1379 is better, but still substantially slower than join. Using join works reasonably well, 1380 but is allocating memory unnecessarily. 1381 1382 Using joinAppend with Appender is a bit faster than join, and allocates less memory. 1383 The Appender re-uses the underlying data buffer, saving memory. The example below 1384 illustrates. It is a modification of the InputFieldReordering example. The role 1385 Appender plus joinAppend are playing is to buffer the output. BufferedOutputRange 1386 uses a similar technique to buffer multiple lines. 

Note: The original uses joinAppend have been replaced by BufferedOutputRange, which has
its own joinAppend method. However, joinAppend remains useful when constructing internal
buffers where BufferedOutputRange is not appropriate.

---
int main(string[] args)
{
    import tsv_utils.common.utils;
    import std.algorithm, std.array, std.range, std.stdio;
    size_t[] fieldIndicies = [3, 0, 2];
    auto fieldReordering = new InputFieldReordering!char(fieldIndicies);
    auto outputBuffer = appender!(char[]);
    foreach (line; stdin.byLine)
    {
        fieldReordering.initNewLine;
        foreach(fieldIndex, fieldValue; line.splitter('\t').enumerate)
        {
            fieldReordering.processNextField(fieldIndex, fieldValue);
            if (fieldReordering.allFieldsFilled) break;
        }
        if (fieldReordering.allFieldsFilled)
        {
            outputBuffer.clear;
            writeln(fieldReordering.outputFields.joinAppend(outputBuffer, ('\t')).data);
        }
        else
        {
            writeln("Error: Insufficient number of field on the line.");
        }
    }
    return 0;
}
---

Params:
    inputRange  = Input range whose elements are either of type E or arrays of E.
    outputRange = Output range the joined result is appended to. Passed by ref, so
                  the caller's range receives the output.
    delimiter   = Value written between consecutive elements of inputRange. It is
                  not written before the first element or after the last.

Returns: outputRange, after all elements have been appended. Nothing is appended
when inputRange is empty.
*/
OutputRange joinAppend(InputRange, OutputRange, E)
    (InputRange inputRange, ref OutputRange outputRange, E delimiter)
if (isInputRange!InputRange &&
    /* Parenthesized so the input range test guards both alternatives: '&&' binds
     * tighter than '||'.
     */
    ((is(ElementType!InputRange : const E[]) &&
      isOutputRange!(OutputRange, E[]))
     ||
     (is(ElementType!InputRange : const E) &&
      isOutputRange!(OutputRange, E)))
    )
{
    /* Write the first element without a leading delimiter. */
    if (!inputRange.empty)
    {
        outputRange.put(inputRange.front);
        inputRange.popFront;
    }

    /* Every remaining element is preceded by the delimiter. */
    foreach (x; inputRange)
    {
        outputRange.put(delimiter);
        outputRange.put(x);
    }
    return outputRange;
}

// joinAppend.
@safe unittest
{
    import std.array : appender;
    import std.algorithm : equal;

    char[] c1 = ['a', 'b', 'c'];
    char[] c2 = ['d', 'e', 'f'];
    char[] c3 = ['g', 'h', 'i'];
    auto cvec = [c1, c2, c3];

    auto s1 = "abc";
    auto s2 = "def";
    auto s3 = "ghi";
    auto svec = [s1, s2, s3];

    auto charAppender = appender!(char[])();

    /* Join mutable char arrays; the input vector must be left unchanged. */
    assert(cvec.joinAppend(charAppender, '_').data == "abc_def_ghi");
    assert(equal(cvec, [c1, c2, c3]));

    /* Join strings onto existing output; the vector just joined (svec) must be
     * left unchanged.
     */
    charAppender.put('$');
    assert(svec.joinAppend(charAppender, '|').data == "abc_def_ghi$abc|def|ghi");
    assert(equal(svec, [s1, s2, s3]));

    charAppender.clear;
    assert(svec.joinAppend(charAppender, '|').data == "abc|def|ghi");

    auto intAppender = appender!(int[])();

    auto i1 = [100, 101, 102];
    auto i2 = [200, 201, 202];
    auto i3 = [300, 301, 302];
    auto ivec = [i1, i2, i3];

    /* Range of arrays: the delimiter separates the arrays. */
    assert(ivec.joinAppend(intAppender, 0).data ==
           [100, 101, 102, 0, 200, 201, 202, 0, 300, 301, 302]);

    /* Range of elements: the delimiter separates the individual values. Output
     * accumulates across calls because the appender is not cleared in between.
     */
    intAppender.clear;
    assert(i1.joinAppend(intAppender, 0).data ==
           [100, 0, 101, 0, 102]);
    assert(i2.joinAppend(intAppender, 1).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202]);
    assert(i3.joinAppend(intAppender, 2).data ==
           [100, 0, 101, 0, 102,
            200, 1, 201, 1, 202,
            300, 2, 301, 2, 302]);
}

/**
getTsvFieldValue extracts the value of a single field from a delimited text string.

This is a convenience function intended for cases when only a single field from an
input line is needed. If multiple values are needed, it will be more efficient to
work directly with std.algorithm.splitter or the InputFieldReordering class.

The input text is split by a delimiter character. The specified field is converted
to the desired type and the value returned.

Two kinds of failure are reported by exception. A field that cannot be converted
fails inside std.conv.to, which throws a std.conv.ConvException. A line with too few
fields produces a plain Exception whose message uses 1-based field numbers, matching
the numbering command line users supply.

Params:
    line       = The delimited text to read the field from.
    fieldIndex = Zero-based index of the field wanted.
    delim      = The field delimiter character.

Returns: The field at fieldIndex converted to type T.
*/
T getTsvFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim)
if (isSomeChar!C)
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.format : format;
    import std.range;

    auto fields = line.splitter(delim);
    size_t fieldsSkipped = 0;

    /* Step over the fields that precede the one requested. */
    for (; fieldsSkipped < fieldIndex && !fields.empty; ++fieldsSkipped)
    {
        fields.popFront;
    }

    /* Normal case: the requested field exists. */
    if (!fields.empty) return fields.front.to!T;

    if (fieldIndex != 0)
    {
        throw new Exception(
            format("Not enough fields on line. Number required: %d; Number found: %d",
                   fieldIndex + 1, fieldsSkipped));
    }

    /* Field zero of an empty line. This is a workaround to a splitter special case:
     * splitting empty input gives an empty range rather than one empty field, which
     * does not represent a single column file properly. The empty input line itself
     * serves as the empty field value. Info:
     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
     */
    assert(line.empty);
    return line.to!T;
}

// getTsvFieldValue.
@safe unittest
{
    import std.conv : ConvException, to;
    import std.exception;

    /* Common cases.
*/ 1565 assert(getTsvFieldValue!double("123", 0, '\t') == 123.0); 1566 assert(getTsvFieldValue!double("-10.5", 0, '\t') == -10.5); 1567 assert(getTsvFieldValue!size_t("abc|123", 1, '|') == 123); 1568 assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 1569 assert(getTsvFieldValue!int("紅\t红\t99", 2, '\t') == 99); 1570 assert(getTsvFieldValue!string("紅\t红\t99", 2, '\t') == "99"); 1571 assert(getTsvFieldValue!string("紅\t红\t99", 1, '\t') == "红"); 1572 assert(getTsvFieldValue!string("紅\t红\t99", 0, '\t') == "紅"); 1573 assert(getTsvFieldValue!string("红色和绿色\tred and green\t赤と緑\t10.5", 2, '\t') == "赤と緑"); 1574 assert(getTsvFieldValue!double("红色和绿色\tred and green\t赤と緑\t10.5", 3, '\t') == 10.5); 1575 1576 /* The empty field cases. */ 1577 assert(getTsvFieldValue!string("", 0, '\t') == ""); 1578 assert(getTsvFieldValue!string("\t", 0, '\t') == ""); 1579 assert(getTsvFieldValue!string("\t", 1, '\t') == ""); 1580 assert(getTsvFieldValue!string("", 0, ':') == ""); 1581 assert(getTsvFieldValue!string(":", 0, ':') == ""); 1582 assert(getTsvFieldValue!string(":", 1, ':') == ""); 1583 1584 /* Tests with different data types. 
*/ 1585 string stringLine = "orange and black\tნარინჯისფერი და შავი\t88.5"; 1586 char[] charLine = "orange and black\tნარინჯისფერი და შავი\t88.5".to!(char[]); 1587 dchar[] dcharLine = stringLine.to!(dchar[]); 1588 wchar[] wcharLine = stringLine.to!(wchar[]); 1589 1590 assert(getTsvFieldValue!string(stringLine, 0, '\t') == "orange and black"); 1591 assert(getTsvFieldValue!string(stringLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1592 assert(getTsvFieldValue!wstring(stringLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1593 assert(getTsvFieldValue!double(stringLine, 2, '\t') == 88.5); 1594 1595 assert(getTsvFieldValue!string(charLine, 0, '\t') == "orange and black"); 1596 assert(getTsvFieldValue!string(charLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1597 assert(getTsvFieldValue!wstring(charLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1598 assert(getTsvFieldValue!double(charLine, 2, '\t') == 88.5); 1599 1600 assert(getTsvFieldValue!string(dcharLine, 0, '\t') == "orange and black"); 1601 assert(getTsvFieldValue!string(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1602 assert(getTsvFieldValue!wstring(dcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1603 assert(getTsvFieldValue!double(dcharLine, 2, '\t') == 88.5); 1604 1605 assert(getTsvFieldValue!string(wcharLine, 0, '\t') == "orange and black"); 1606 assert(getTsvFieldValue!string(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი"); 1607 assert(getTsvFieldValue!wstring(wcharLine, 1, '\t') == "ნარინჯისფერი და შავი".to!wstring); 1608 assert(getTsvFieldValue!double(wcharLine, 2, '\t') == 88.5); 1609 1610 /* Conversion errors. 
*/
    assertThrown!ConvException(getTsvFieldValue!double("", 0, '\t'));
    assertThrown!ConvException(getTsvFieldValue!double("abc", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("-1", 0, '|'));
    assertThrown!ConvException(getTsvFieldValue!size_t("a23|23.4", 1, '|'));
    assertThrown!ConvException(getTsvFieldValue!double("23.5|def", 1, '|'));

    /* Not enough field errors. These should throw, but not a ConvException.*/
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc", 1, '\t')));
    assertThrown(assertNotThrown!ConvException(getTsvFieldValue!double("abc\tdef", 2, '\t')));
}

/**
NewlineWasRemoved is the flag type of the throwIfWindowsNewline template parameter.
Yes.newlineWasRemoved means the Unix newline has already been stripped from the line,
as happens when reading with std.stdio.File.byLine or a similar mechanism.
No.newlineWasRemoved means the line still carries its terminator.
*/
alias NewlineWasRemoved = Flag!"newlineWasRemoved";

/**
throwIfWindowsNewline throws an exception when the 'line' argument ends with a
Windows/DOS line ending. TSV Utilities tools use it to detect Windows/DOS line
endings and stop processing with an error message to the user.

The 'nlWasRemoved' template parameter says whether the Unix newline character was
already removed. If it was, the CR character of a Windows CRLF is still the last
character on the line and is what gets detected. This is useful when reading files in
binary mode, stripping Unix newlines. If it was not removed, the line must end with
the full CR-LF pair to be reported.

Params:
    line     = The input line to check.
    filename = Name of the input, used in the error message. A value of "-" is
               reported as "Standard Input".
    lineNum  = Line number, used in the error message.

Throws: Exception if a Windows/DOS line ending is found.
*/
void throwIfWindowsNewline
    (NewlineWasRemoved nlWasRemoved = Yes.newlineWasRemoved)
    (const char[] line, const char[] filename, size_t lineNum)
{
    static if (nlWasRemoved)
    {
        /* The LF is gone, so a trailing CR is what remains of a CRLF. */
        immutable bool crlfDetected = line.length >= 1 && line[$ - 1] == '\r';
    }
    else
    {
        /* The terminator is still present: require the complete CR-LF pair. */
        immutable bool crlfDetected = line.length >= 2 && line[$ - 2 .. $] == "\r\n";
    }

    if (!crlfDetected) return;

    import std.format;

    const(char)[] sourceName = (filename == "-") ? "Standard Input" : filename;
    throw new Exception(
        format("Windows/DOS line ending found. Convert file to Unix newlines before processing (e.g. 'dos2unix').\n File: %s, Line: %s",
               sourceName, lineNum));
}

// throwIfWindowsNewline
@safe unittest
{
    import std.exception;

    assertNotThrown(throwIfWindowsNewline("", "afile.tsv", 1));
    assertNotThrown(throwIfWindowsNewline("a", "afile.tsv", 2));
    assertNotThrown(throwIfWindowsNewline("ab", "afile.tsv", 3));
    assertNotThrown(throwIfWindowsNewline("abc", "afile.tsv", 4));

    assertThrown(throwIfWindowsNewline("\r", "afile.tsv", 1));
    assertThrown(throwIfWindowsNewline("a\r", "afile.tsv", 2));
    assertThrown(throwIfWindowsNewline("ab\r", "afile.tsv", 3));
    assertThrown(throwIfWindowsNewline("abc\r", "afile.tsv", 4));

    assertNotThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("\n", "afile.tsv", 1));
    assertNotThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("a\n", "afile.tsv", 2));
    assertNotThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("ab\n", "afile.tsv", 3));
    assertNotThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("abc\n", "afile.tsv", 4));

    assertThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("\r\n", "afile.tsv", 5));
    assertThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("a\r\n", "afile.tsv", 6));
assertThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("ab\r\n", "afile.tsv", 7));
    assertThrown(throwIfWindowsNewline!(No.newlineWasRemoved)("abc\r\n", "afile.tsv", 8));

    /* Standard Input formatting. */
    import std.algorithm : endsWith;
    bool exceptionCaught = false;

    try (throwIfWindowsNewline("\r", "-", 99));
    catch (Exception e)
    {
        assert(e.msg.endsWith("File: Standard Input, Line: 99"));
        exceptionCaught = true;
    }
    finally
    {
        assert(exceptionCaught);
        exceptionCaught = false;
    }

    try (throwIfWindowsNewline!(No.newlineWasRemoved)("\r\n", "-", 99));
    catch (Exception e)
    {
        assert(e.msg.endsWith("File: Standard Input, Line: 99"));
        exceptionCaught = true;
    }
    finally
    {
        assert(exceptionCaught);
        exceptionCaught = false;
    }
}

/**
inputSourceRange is a helper function for creating new InputSourceRange objects.

It forwards both arguments unchanged to the InputSourceRange constructor. That
constructor opens the first file in the list, so errors opening that file surface
from this call.

Params:
    filepaths  = The input files to iterate over, in order.
    readHeader = Passed through to each InputSource created during iteration.

Returns: A newly allocated InputSourceRange.
*/
InputSourceRange inputSourceRange(string[] filepaths, ReadHeader readHeader)
{
    return new InputSourceRange(filepaths, readHeader);
}

/**
InputSourceRange is an input range that iterates over a set of input files.

InputSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The caller can choose to
have header lines read automatically.

The range is created from a set of filepaths. These filepaths are mapped to
InputSource objects during the iteration. This is what enables automatically opening
and closing files and reading the header line.

The motivation for an InputSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file and the header line along to the main processing functions.
This enables features like named fields to be implemented in a standard way.

Both InputSourceRange and InputSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new InputSourceRange against the same filepaths.

Currently, InputSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
*/
final class InputSourceRange
{
    private string[] _filepaths;               // Paths not yet opened.
    private immutable ReadHeader _readHeader;  // Given to every InputSource created.
    private InputSource _front;                // Current source; null when exhausted.

    /** Copies the path list and opens the first input source, if there is one. */
    this(string[] filepaths, ReadHeader readHeader)
    {
        _filepaths = filepaths.dup;
        _readHeader = readHeader;
        openNextSource();
    }

    /** Number of input sources remaining, counting the current front element. */
    size_t length() const pure nothrow @safe
    {
        if (empty) return 0;
        return _filepaths.length + 1;
    }

    /** True once every input source has been popped, or if no paths were given. */
    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    /** The currently open input source. */
    InputSource front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty InputSourceRange");
        return _front;
    }

    /** Closes the current input source and opens the next one, if any. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty InputSourceRange");

        _front.close;
        openNextSource();
    }

    /* Makes the next unopened path the front element, opening it. Sets the front
     * to null when no paths remain.
     */
    private void openNextSource()
    {
        if (_filepaths.empty)
        {
            _front = null;
            return;
        }

        _front = new InputSource(_filepaths.front, _readHeader);
        _front.open;
        _filepaths.popFront;
    }
}

/**
InputSource is a class of objects produced by iterating over an InputSourceRange.

An InputSource object provides access to the open file currently the front element
of an InputSourceRange. The main methods application code is likely to need are:

$(LIST
    * `file()` - Returns the File object.
The file will be open for reading as long as the
      InputSource instance is the front element of the InputSourceRange it came from.

    * `header(KeepTerminator keepTerminator = No.keepTerminator)` - Returns the
      header line from the file. An empty string is returned if the InputSourceRange
      was created with readHeader=false.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      InputSourceRange. For standard input, it is "Standard Input".
)

An InputSource is a reference object, so the copies will retain the state of the
InputSourceRange front element. In particular, all copies will have the open
state of the front element of the InputSourceRange.

This class is not intended for use outside the context of an InputSourceRange.
*/
final class InputSource
{
    import std.stdio;

    private immutable string _filepath;
    private immutable bool _isStdin;
    private bool _isOpen;
    private ReadHeader _readHeader;
    private bool _hasBeenOpened;
    private string _header;
    private File _file;

    /* Records the source description only. Nothing is opened until open() is called.
     * The filepath "-" designates standard input.
     */
    private this(string filepath, ReadHeader readHeader) pure nothrow @safe
    {
        _filepath = filepath;
        _isStdin = (filepath == "-");
        _readHeader = readHeader;
        _isOpen = false;
        _hasBeenOpened = false;
    }

    /** file returns the File object held by the InputSource.
     *
     * The File will be open for reading as long as the InputSource instance is the
     * front element of the InputSourceRange it came from.
     */
    File file() nothrow @safe
    {
        return _file;
    }

    /** isReadHeaderEnabled returns true if the header line is being read.
     */
    bool isReadHeaderEnabled() const pure nothrow @safe
    {
        return _readHeader == Yes.readHeader;
    }

    /** header returns the header line from the input file.
     *
     * The trailing newline ('\n'), if present, is dropped unless Yes.keepTerminator
     * is passed. An empty string is returned if the InputSourceRange was created
     * with readHeader=false.
     */
    string header(KeepTerminator keepTerminator = No.keepTerminator) const pure nothrow @safe
    {
        assert(_hasBeenOpened);

        if (keepTerminator == Yes.keepTerminator) return _header;
        if (_header.length == 0) return _header;
        if (_header[$ - 1] != '\n') return _header;

        return _header[0 .. $ - 1];
    }

    /** isHeaderEmpty returns true if there is no data for a header, including the
     * terminator.
     *
     * When headers are being read, this is true only if the file is empty.
     */
    bool isHeaderEmpty() const pure nothrow @safe
    {
        assert(_hasBeenOpened);
        return _header.length == 0;
    }

    /** name returns a user friendly name representing the input source.
     *
     * For files, it is the filepath provided to InputSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input,
     * not name().)
     */
    string name() const pure nothrow @safe
    {
        if (_isStdin) return "Standard Input";
        return _filepath;
    }

    /** isStdin returns true if the input source is Standard Input, false otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the input source is open for reading, false otherwise.
     *
     * "Open" in this context is whether the InputSource object is currently open,
     * meaning that it is the front element of the InputSourceRange that created it.
     *
     * For files, this is also reflected in the state of the underlying File object.
     * However, standard input is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the underlying file (or attaches to stdin) and, when header reading is
     * enabled, consumes the first line as the header. May only be called once.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        if (isStdin) _file = stdin;
        else _file = _filepath.File("rb");

        if (isReadHeaderEnabled) _header = _file.readln;

        /* State flags are set last, so they stay false if opening or reading throws. */
        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the source closed. Real files are closed; standard input is left alone. */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}

// InputSourceRange and InputSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    /* Fixtures, indexed by file number. file0 is completely empty, file1 has a
     * header line only.
     */
    string[] fileHeaders = ["", "file 1 header\n", "file 2 header\n", "file 3 header\n"];
    string[] fileBodies = ["", "", "file 2 line 1\n", "file 3 line 1\nfile 3 line 2\n"];

    string[] inputFiles;
    string[] fileData;
    foreach (i, fileName; ["file0.txt", "file1.txt", "file2.txt", "file3.txt"])
    {
        inputFiles ~= buildPath(testDir, fileName);
        fileData ~= fileHeaders[i] ~ fileBodies[i];
        inputFiles[i].File("wb").write(fileData[i]);
    }

    auto readSources = appender!(InputSource[]);
    auto buffer = new char[1024];   // Must be large enough to hold the test files.

    /* Tests without standard input. Don't want to count on state of standard
     * input or modifying it when doing unit tests, so avoid reading from it.
     */

    foreach (numFiles; 1 .. inputFiles.length + 1)
    {
        /* First with header reading enabled, then without. */
        foreach (readHdr; [Yes.readHeader, No.readHeader])
        {
            immutable bool withHeader = (readHdr == Yes.readHeader);

            readSources.clear;
            auto inputSources = inputSourceRange(inputFiles[0 .. numFiles], readHdr);
            assert(inputSources.length == numFiles);

            foreach (fileNum, source; inputSources.enumerate)
            {
                readSources.put(source);
                assert(source.isOpen);
                assert(source.file.isOpen);
                assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen));
                assert(readSources.data[fileNum].isOpen);

                /* Without header reading the header is always empty. */
                string expectedHeader = withHeader ? fileHeaders[fileNum] : "";
                string expectedHeaderNoTerminator =
                    (expectedHeader.length > 0) ? expectedHeader[0 .. $ - 1] : expectedHeader;

                assert(source.header(Yes.keepTerminator) == expectedHeader);
                assert(source.header(No.keepTerminator) == expectedHeaderNoTerminator);

                assert(source.name == inputFiles[fileNum]);
                assert(!source.isStdin);
                assert(source.isReadHeaderEnabled == withHeader);

                /* What remains in the file is the body if the header was consumed,
                 * the full file contents otherwise.
                 */
                string expectedRemaining = withHeader ? fileBodies[fileNum] : fileData[fileNum];
                assert(source.file.rawRead(buffer) == expectedRemaining);
            }

            /* The InputSourceRange is a reference range, consumed by the foreach. */
            assert(inputSources.empty);
        }
    }

    /* Tests with standard input. No actual reading in these tests.
     */

    readSources.clear;
    foreach (fileNum, source; inputSourceRange(["-", "-"], No.readHeader).enumerate)
    {
        readSources.put(source);
        assert(source.isOpen);
        assert(source.file.isOpen);
        assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen));      // InputSource objects are "closed".
        assert(readSources.data[0 .. fileNum].all!(s => s.file.isOpen));  // Actual stdin should not be closed.
        assert(readSources.data[fileNum].isOpen);

        assert(source.header(Yes.keepTerminator).empty);
        assert(source.header(No.keepTerminator).empty);

        assert(source.name == "Standard Input");
        assert(source.isStdin);
    }

    /* Empty filelist. */
    string[] nofiles;
    {
        auto sources = inputSourceRange(nofiles, No.readHeader);
        assert(sources.empty);
    }
    {
        auto sources = inputSourceRange(nofiles, Yes.readHeader);
        assert(sources.empty);
    }

    /* Error cases. */
    assertThrown(inputSourceRange([inputFiles[0], "no_such_file.txt"], No.readHeader).each);
    assertThrown(inputSourceRange(["no_such_file.txt", inputFiles[1]], Yes.readHeader).each);
}

/**
byLineSourceRange is a helper function for creating new ByLineSourceRange objects.
*/
auto byLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
(string[] filepaths, LineBuffered lineBuffered = No.lineBuffered,
 ReadHeader readHeader = No.readHeader)
if (is(Char == char) || is(Char == ubyte))
{
    /* All template arguments are forwarded unchanged to the range class. */
    alias RangeType = ByLineSourceRange!(keepTerminator, Char, terminator);
    return new RangeType(filepaths, lineBuffered, readHeader);
}

/**
ByLineSourceRange is an input range that iterates over a set of input files. It
provides bufferedByLine access to each file.

A ByLineSourceRange is used to iterate over a set of files passed on the command line.
Files are automatically opened and closed during iteration. The front element of the
range provides access to a bufferedByLine for iterating over the lines in the file.

The range is created from a set of filepaths. These filepaths are mapped to
ByLineSource objects during the iteration. This is what enables automatically opening
and closing files and providing bufferedByLine access.

The motivation behind ByLineSourceRange is to provide a standard way to look at the
header line of the first input file during command line argument processing, and then
pass the open input file along to the main processing functions. This enables
features like named fields to be implemented in a standard way.

Access to the first line of the first file is available after creating the
ByLineSourceRange instance. The first file is opened and a bufferedByLine created.
The first line of the first file is available via byLine.front (after checking
!byLine.empty).

Buffering is handled by bufferedByLine. Full buffering is used by default; this can be
changed to line buffering by Yes.lineBuffered. When using full buffering, the header
line (first line) of the first file can be read as soon as available using Yes.readHeader.
This is only done for the first file, as that is when immediate processing is useful.

Both ByLineSourceRange and ByLineSource are reference objects. This keeps their use
limited to a single iteration over the set of files. The files can be iterated again
by creating a new ByLineSourceRange against the same filepaths.

Currently, ByLineSourceRange supports files and standard input. It is possible other
types of input sources will be added in the future.
*/
final class ByLineSourceRange(
    KeepTerminator keepTerminator = No.keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    /* The element type of the range. Char is forwarded so that the character type
     * requested by the caller (char or ubyte) is passed on rather than being
     * silently replaced by char.
     */
    alias ByLineSourceType = ByLineSource!(keepTerminator, Char, terminator);

    private string[] _filepaths;
    private immutable LineBuffered _lineBuffered;
    private ByLineSourceType _front;

    /** Takes a private copy of the filepaths and opens the first one, if any.
     *
     * Params:
     *   filepaths    = Files to iterate over.
     *   lineBuffered = Buffering mode, passed to every ByLineSource created.
     *   readHeader   = Passed to the ByLineSource for the first file only.
     */
    this(string[] filepaths, LineBuffered lineBuffered = No.lineBuffered,
         ReadHeader readHeader = No.readHeader)
    {
        _filepaths = filepaths.dup;
        _lineBuffered = lineBuffered;
        _front = null;

        if (!_filepaths.empty)
        {
            _front = new ByLineSourceType(_filepaths.front, _lineBuffered, readHeader);
            _front.open;
            _filepaths.popFront;
        }
    }

    /** Number of input sources remaining, counting the current front element. */
    size_t length() const pure nothrow @safe
    {
        return empty ? 0 : _filepaths.length + 1;
    }

    /** True when there is no current input source. */
    bool empty() const pure nothrow @safe
    {
        return _front is null;
    }

    /** The ByLineSource for the file currently open. */
    ByLineSourceType front() pure @safe
    {
        assert(!empty, "Attempt to take the front of an empty ByLineSourceRange");
        return _front;
    }

    /** Closes the current input source and opens the next one, if any. */
    void popFront()
    {
        assert(!empty, "Attempt to popFront an empty ByLineSourceRange");

        _front.close;

        if (!_filepaths.empty)
        {
            /* readHeader is deliberately not passed on: it applies to the first
             * file only, so later files use the ByLineSource default.
             */
            _front = new ByLineSourceType(_filepaths.front, _lineBuffered);
            _front.open;
            _filepaths.popFront;
        }
        else
        {
            _front = null;
        }
    }
}

/**
ByLineSource is a class of objects produced by iterating over a ByLineSourceRange.

A ByLineSource instance provides a bufferedByLine range for the current front
element of a ByLineSourceRange. The main methods application code is likely to
need are:

$(LIST
    * `byLine()` - Returns the bufferedByLine range accessing the open file. The file
      will be open for reading (using the bufferedByLine range) as long as the
      ByLineSource instance is the front element of the ByLineSourceRange
      it came from.

    * `name()` - The name of the input source. The name returned is intended for
      user error messages. For files, this is the filepath that was passed to
      ByLineSourceRange. For standard input, it is "Standard Input".
)

A ByLineSource is a reference object, so the copies have the same state as the
ByLineSourceRange front element. In particular, all copies will have the open
state of the front element of the ByLineSourceRange.

This class is not intended for use outside the context of a ByLineSourceRange.
*/
final class ByLineSource(
    KeepTerminator keepTerminator, Char = char, ubyte terminator = '\n')
if (is(Char == char) || is(Char == ubyte))
{
    import std.stdio;
    import std.traits : ReturnType;

    /* Char is forwarded to bufferedByLine so the character type requested by the
     * caller is used rather than being silently replaced by char.
     */
    alias newByLineFn = bufferedByLine!(keepTerminator, Char, terminator);
    alias ByLineType = ReturnType!newByLineFn;

    private immutable string _filepath;
    private immutable LineBuffered _lineBuffered;
    private immutable ReadHeader _readHeader;
    private immutable bool _isStdin;
    private bool _isOpen;
    private bool _hasBeenOpened;
    private File _file;
    private ByLineType _byLineRange;

    /* Records the source description only. Nothing is opened until open() is called.
     * The filepath "-" designates standard input.
     */
    private this(string filepath, LineBuffered lineBuffered = No.lineBuffered,
                 ReadHeader readHeader = No.readHeader) pure nothrow @safe
    {
        _filepath = filepath;
        _lineBuffered = lineBuffered;
        _readHeader = readHeader;
        _isStdin = filepath == "-";
        _isOpen = false;
        _hasBeenOpened = false;
    }

    /** byLine returns the bufferedByLine object held by the ByLineSource instance.
     *
     * The File underlying the BufferedByLine object is open for reading as long as
     * the ByLineSource instance is the front element of the ByLineSourceRange it
     * came from.
     */
    ByLineType byLine() nothrow @safe
    {
        return _byLineRange;
    }

    /** name returns a user friendly name representing the underlying input source.
     *
     * For files, it is the filepath provided to ByLineSourceRange. For standard
     * input, it is "Standard Input". (Use isStdin() to test for standard input,
     * do not compare against name().)
     */
    string name() const pure nothrow @safe
    {
        return _isStdin ? "Standard Input" : _filepath;
    }

    /** isStdin returns true if the underlying input source is Standard Input, false
     * otherwise.
     */
    bool isStdin() const pure nothrow @safe
    {
        return _isStdin;
    }

    /** isOpen returns true if the ByLineSource instance is open for reading, false
     * otherwise.
     *
     * "Open" in this context is whether the ByLineSource object is currently "open".
     * The underlying input source backing it does not necessarily have the same
     * state. The ByLineSource instance is "open" if it is the front element of the
     * ByLineSourceRange that created it.
     *
     * The underlying input source object follows the same open/close state as makes
     * sense. In particular, real files are closed when the ByLineSource object is
     * closed. The exception is standard input, which is never actually closed.
     */
    bool isOpen() const pure nothrow @safe
    {
        return _isOpen;
    }

    /* Opens the underlying file (or attaches to stdin) and creates the bufferedByLine
     * range over it. May only be called once. The state flags are set last, so they
     * stay false if opening the file throws.
     */
    private void open()
    {
        assert(!_isOpen);
        assert(!_hasBeenOpened);

        _file = isStdin ? stdin : _filepath.File("rb");
        _byLineRange = newByLineFn(_file, _lineBuffered, _readHeader);
        _isOpen = true;
        _hasBeenOpened = true;
    }

    /* Marks the source closed. Real files are closed; standard input is left alone. */
    private void close()
    {
        if (!_isStdin) _file.close;
        _isOpen = false;
    }
}

// ByLineSourceRange and ByLineSource
unittest
{
    import std.algorithm : all, each;
    import std.array : appender;
    import std.exception : assertThrown;
    import std.file : rmdirRecurse;
    import std.path : buildPath;
    import std.range;
    import std.stdio;
    import tsv_utils.common.unittest_utils;

    auto testDir = makeUnittestTempDir("tsv_utils_byline_input_source_range");
    scope(exit) testDir.rmdirRecurse;

    string file0 = buildPath(testDir, "file0.txt");
    string file1 = buildPath(testDir, "file1.txt");
    string file2 = buildPath(testDir, "file2.txt");
    string file3 = buildPath(testDir, "file3.txt");

    string file0Header = "";
    string file1Header = "file 1 header\n";
    string file2Header = "file 2 header\n";
    string file3Header = "file 3 header\n";

    string file0Body = "";
    string file1Body = "";
    string file2Body = "file 2 line 1\n";
    string file3Body = "file 3 line 1\nfile 3 line 2\n";

    string file0Data = file0Header ~ file0Body;
    string file1Data = file1Header ~ file1Body;
    string file2Data = file2Header ~ file2Body;
    string file3Data = file3Header ~ file3Body;

    {
        file0.File("wb").write(file0Data);
        file1.File("wb").write(file1Data);
        file2.File("wb").write(file2Data);
        file3.File("wb").write(file3Data);
    }

    auto inputFiles = [file0, file1, file2, file3];
    auto fileHeaders = [file0Header, file1Header, file2Header, file3Header];
    auto fileData = [file0Data, file1Data, file2Data, file3Data];

    /* Test without standard input. Don't want to count on state of standard
     * input or modifying it when doing unit tests, so avoid reading from it.
     */

    static foreach (keepTerm; [No.keepTerminator, Yes.keepTerminator])
    {
        foreach (lineBuf; [No.lineBuffered, Yes.lineBuffered])
        {
            foreach (readHdr; [No.readHeader, Yes.readHeader])
            {
                foreach (numFiles; 1 .. inputFiles.length + 1)
                {
                    auto readSources = appender!(ByLineSource!(keepTerm)[]);
                    auto inputSources =
                        byLineSourceRange!(keepTerm)(inputFiles[0 .. numFiles], lineBuf, readHdr);
                    assert(inputSources.length == numFiles);

                    foreach (fileNum, source; inputSources.enumerate)
                    {
                        readSources.put(source);
                        assert(source.isOpen);
                        assert(source._file.isOpen);
                        assert(readSources.data[0 .. fileNum].all!(s => !s.isOpen));
                        assert(readSources.data[fileNum].isOpen);

                        /* The first line is the header, without its newline unless
                         * terminators are being kept.
                         */
                        auto headerLength = fileHeaders[fileNum].length;
                        static if (!keepTerm)
                        {
                            if (headerLength > 0) --headerLength;
                        }

                        assert(source.byLine.empty ||
                               source.byLine.front == fileHeaders[fileNum][0 .. headerLength]);

                        assert(source.name == inputFiles[fileNum]);
                        assert(!source.isStdin);

                        /* Reassemble the file from its lines and compare to what was written. */
                        auto readFileData = appender!(char[]);
                        foreach (line; source.byLine)
                        {
                            readFileData.put(line);
                            static if (!keepTerm) readFileData.put('\n');
                        }

                        assert(readFileData.data == fileData[fileNum]);
                    }

                    /* The ByLineSourceRange is a reference range, consumed by the foreach. */
                    assert(inputSources.empty);
                }
            }
        }
    }

    /* Empty filelist. */
    string[] nofiles;
    {
        auto sources = byLineSourceRange!(No.keepTerminator)(nofiles);
        assert(sources.empty);
    }
    {
        auto sources = byLineSourceRange!(Yes.keepTerminator)(nofiles);
        assert(sources.empty);
    }

    /* Error cases. */
    assertThrown(byLineSourceRange!(No.keepTerminator)([file0, "no_such_file.txt"]).each);
    assertThrown(byLineSourceRange!(Yes.keepTerminator)(["no_such_file.txt", file1]).each);
}

/** Defines the 'bufferable' input sources supported by inputSourceByChunk.
 *
 * This includes std.stdio.File objects and mutable dynamic ubyte arrays. Or, input
 * ranges with ubyte elements.
 *
 * Static, const, and immutable arrays can be sliced to turn them into input ranges.
 *
 * Note: The element types could easily be generalized much further if that were useful.
 * At present, the primary purpose of inputSourceByChunk is to have a range representing
 * a buffered file that can also take ubyte arrays as sources for unit testing.
*/
template isBufferableInputSource(R)
{
    static if (isFileHandle!(Unqual!R))
    {
        /* File handles are always bufferable. */
        enum bool isBufferableInputSource = true;
    }
    else static if (isInputRange!R)
    {
        /* Input ranges qualify when their (unqualified) element type is ubyte. */
        enum bool isBufferableInputSource = is(Unqual!(ElementEncodingType!R) == ubyte);
    }
    else
    {
        enum bool isBufferableInputSource = false;
    }
}

@safe unittest
{
    import std.stdio : stdin;

    /* File handles and ubyte arrays qualify, character arrays do not. */
    static assert(isBufferableInputSource!(File));
    static assert(isBufferableInputSource!(typeof(stdin)));
    static assert(isBufferableInputSource!(ubyte[]));
    static assert(!isBufferableInputSource!(char[]));
    static assert(!isBufferableInputSource!(string));

    ubyte[10] staticArray;
    const ubyte[1] staticConstArray;
    immutable ubyte[1] staticImmutableArray;
    const(ubyte)[1] staticArrayConstElts;
    immutable(ubyte)[1] staticArrayImmutableElts;

    ubyte[] dynamicArray = new ubyte[](10);
    const(ubyte)[] dynamicArrayConstElts = new ubyte[](10);
    immutable(ubyte)[] dynamicArrayImmutableElts = new ubyte[](10);
    const ubyte[] dynamicConstArray = new ubyte[](10);
    immutable ubyte[] dynamicImmutableArray = new ubyte[](10);

    /* Static arrays are never bufferable as is. */
    static assert(!isBufferableInputSource!(typeof(staticArray)));
    static assert(!isBufferableInputSource!(typeof(staticArrayConstElts)));
    static assert(!isBufferableInputSource!(typeof(staticArrayImmutableElts)));
    static assert(!isBufferableInputSource!(typeof(staticConstArray)));
    static assert(!isBufferableInputSource!(typeof(staticImmutableArray)));

    /* Dynamic arrays are bufferable when the array itself is mutable. */
    static assert(isBufferableInputSource!(typeof(dynamicArray)));
    static assert(isBufferableInputSource!(typeof(dynamicArrayConstElts)));
    static assert(isBufferableInputSource!(typeof(dynamicArrayImmutableElts)));
    static assert(!isBufferableInputSource!(typeof(dynamicConstArray)));
    static assert(!isBufferableInputSource!(typeof(dynamicImmutableArray)));

    /* Slicing turns all forms into bufferable arrays. */
    static assert(isBufferableInputSource!(typeof(staticArray[])));
    static assert(isBufferableInputSource!(typeof(staticArrayConstElts[])));
    static assert(isBufferableInputSource!(typeof(staticArrayImmutableElts[])));
    static assert(isBufferableInputSource!(typeof(staticConstArray[])));
    static assert(isBufferableInputSource!(typeof(staticImmutableArray[])));

    static assert(isBufferableInputSource!(typeof(dynamicConstArray[])));
    static assert(isBufferableInputSource!(typeof(dynamicImmutableArray[])));
    static assert(isBufferableInputSource!(typeof(dynamicArray[])));
    static assert(isBufferableInputSource!(typeof(dynamicArrayConstElts[])));
    static assert(isBufferableInputSource!(typeof(dynamicArrayImmutableElts[])));

    /* Element type tests. */
    static assert(is(Unqual!(ElementType!(typeof(staticArray))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(staticArrayConstElts))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(staticArrayImmutableElts))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(staticConstArray))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(staticImmutableArray))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(dynamicArray))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(dynamicArrayConstElts))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(dynamicArrayImmutableElts))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(dynamicConstArray))) == ubyte));
    static assert(is(Unqual!(ElementType!(typeof(dynamicImmutableArray))) == ubyte));

    /* A minimal user-defined input range of ubyte. */
    struct S1
    {
        void popFront();
        @property bool empty();
        @property ubyte front();
    }

    /* A user-defined range of ubyte that also provides save, length, and slicing. */
    struct S2
    {
        @property ubyte front();
        void popFront();
        @property bool empty();
        @property auto save() { return this; }
        @property size_t length();
        S2 opSlice(size_t, size_t);
    }

    static assert(isInputRange!S1);
    static assert(isBufferableInputSource!S1);

    static assert(isInputRange!S2);
    static assert(is(ElementEncodingType!S2 == ubyte));
    static assert(hasSlicing!S2);
    static assert(isBufferableInputSource!S2);

    /* For code coverage. */
    S2 s2;
    auto x = s2.save;

    /* Standard library ranges: the element type decides, finite or infinite. */
    auto repeatInt = 7.repeat!int(5);
    auto repeatUbyte = 7.repeat!ubyte(5);
    auto infiniteUbyte = 7.repeat!ubyte;

    static assert(!isBufferableInputSource!(typeof(repeatInt)));
    static assert(isBufferableInputSource!(typeof(repeatUbyte)));
    static assert(isBufferableInputSource!(typeof(infiniteUbyte)));
}

/** inputSourceByChunk returns a range that reads either a file handle (File) or a
 * ubyte[] array a chunk at a time.
 *
 * This is a cover for File.byChunk that allows passing an in-memory array or input
 * range as well. At present the motivation is primarily to enable unit testing of
 * chunk-based algorithms using in-memory strings.
 *
 * inputSourceByChunk takes either a File open for reading or an input range with
 * ubyte elements. Data is read a buffer at a time. The buffer can be user provided,
 * or allocated by inputSourceByChunk based on a caller provided buffer size.
 *
 * The primary motivation for supporting both files and input ranges as sources is to
 * enable unit testing of buffer based algorithms using in-memory arrays. Dynamic,
 * mutable arrays are fine. Use slicing to turn a static, const, or immutable array
 * into an input range.
 *
 * The chunks are returned as an input range.
*/
auto inputSourceByChunk(InputSource)(InputSource source, size_t size)
if (isBufferableInputSource!InputSource)
{
    /* Allocate the buffer and defer to the buffer-taking overload, which also
     * rejects a zero-length buffer.
     */
    return inputSourceByChunk(source, new ubyte[](size));
}

/// Ditto
auto inputSourceByChunk(InputSource)(InputSource source, ubyte[] buffer)
if (isBufferableInputSource!InputSource)
{
    static if (isFileHandle!(Unqual!InputSource))
    {
        /* Files are handled directly by the standard library. */
        return source.byChunk(buffer);
    }
    else
    {
        /* Input range of chunks copied into the caller's buffer. The same buffer is
         * reused for every chunk, so front is only valid until the next popFront.
         */
        static struct BufferedChunk
        {
            private Chunks!InputSource _chunks;
            private ubyte[] _buffer;

            /* Copies the next chunk into _buffer. A zero-length _buffer signals the
             * end of the input.
             */
            private void readNextChunk()
            {
                if (_chunks.empty)
                {
                    _buffer.length = 0;
                }
                else
                {
                    import std.algorithm : copy;
                    auto remainingBuffer = _chunks.front.take(_buffer.length).copy(_buffer);
                    _chunks.popFront;

                    /* Only the last chunk should be shorter than the buffer. */
                    assert(remainingBuffer.length == 0 || _chunks.empty);

                    /* Trim the buffer to the data actually read. */
                    _buffer.length -= remainingBuffer.length;
                }
            }

            this(InputSource source, ubyte[] buffer)
            {
                import std.exception : enforce;
                enforce(buffer.length > 0, "buffer size must be larger than 0");
                _chunks = source.chunks(buffer.length);
                _buffer = buffer;
                readNextChunk();
            }

            @property bool empty()
            {
                return (_buffer.length == 0);
            }

            @property ubyte[] front()
            {
                assert(!empty, "Attempting to fetch the front of an empty inputSourceByChunks");
                return _buffer;
            }

            void popFront()
            {
                assert(!empty, "Attempting to popFront an empty inputSourceByChunks");
                readNextChunk();
            }
        }

        return BufferedChunk(source, buffer);
    }
}

unittest  // inputSourceByChunk
{
    import tsv_utils.common.unittest_utils;   // tsv-utils unit test helpers
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_utils_inputSourceByChunk");
    scope(exit) testDir.rmdirRecurse;

    import std.algorithm : equal, joiner;
    import std.format;
    import std.string : representation;

    auto charData = "abcde,ßÀß,あめりか物語,012345";
    ubyte[] ubyteData = charData.dup.representation;

    ubyte[1024] rawBuffer;  // Must be larger than largest bufferSize in tests.

    void writeFileData(string filePath, ubyte[] data)
    {
        import std.stdio;

        auto f = filePath.File("wb");
        f.rawWrite(data);
        f.close;
    }

    /* Every prefix of the data, from empty up to and including the full data,
     * read with every buffer size from one byte up to one more than the data size.
     */
    foreach (size_t dataSize; 0 .. ubyteData.length + 1)
    {
        auto data = ubyteData[0 .. dataSize];
        auto filePath = buildPath(testDir, format("data_%d.txt", dataSize));
        writeFileData(filePath, data);

        foreach (size_t bufferSize; 1 .. dataSize + 2)
        {
            assert(data.inputSourceByChunk(bufferSize).joiner.equal(data),
                   format("[Test-A] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            assert (rawBuffer.length >= bufferSize);

            ubyte[] buffer = rawBuffer[0 .. bufferSize];
            assert(data.inputSourceByChunk(buffer).joiner.equal(data),
                   format("[Test-B] dataSize: %d, bufferSize: %d", dataSize, bufferSize));

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(bufferSize).joiner.equal(data),
                       format("[Test-C] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }

            {
                auto inputStream = filePath.File;
                assert(inputStream.inputSourceByChunk(buffer).joiner.equal(data),
                       format("[Test-D] dataSize: %d, bufferSize: %d", dataSize, bufferSize));
                inputStream.close;
            }
        }
    }
}

@safe unittest // inputSourceByChunk array cases
{
    import std.algorithm : equal;

    ubyte[5] staticArray = [5, 6, 7, 8, 9];
    const(ubyte)[5] staticArrayConstElts = [5, 6, 7, 8, 9];
    immutable(ubyte)[5] staticArrayImmutableElts = [5, 6, 7, 8, 9];
    const ubyte[5] staticConstArray = [5, 6, 7, 8, 9];
    immutable ubyte[5] staticImmutableArray = [5, 6, 7, 8, 9];

    ubyte[] dynamicArray = [5, 6, 7, 8, 9];
    const(ubyte)[] dynamicArrayConstElts = [5, 6, 7, 8, 9];
    immutable(ubyte)[] dynamicArrayImmutableElts = [5, 6, 7, 8, 9];
    const ubyte[] dynamicConstArray = [5, 6, 7, 8, 9];
    immutable ubyte[] dynamicImmutableArray = [5, 6, 7, 8, 9];

    /* The dynamic mutable arrays can be used directly. */
    assert (dynamicArray.inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicArrayConstElts.inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicArrayImmutableElts.inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));

    /* All the arrays can be used with slicing. */
    assert (staticArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (staticArrayConstElts[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (staticArrayImmutableElts[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (staticConstArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (staticImmutableArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicArrayConstElts[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicArrayImmutableElts[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicConstArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
    assert (dynamicImmutableArray[].inputSourceByChunk(2).equal([[5, 6], [7, 8], [9]]));
}

@safe unittest // inputSourceByChunk input ranges
{
    import std.algorithm : equal;

    assert (7.repeat!ubyte(5).inputSourceByChunk(1).equal([[7], [7], [7], [7], [7]]));
    assert (7.repeat!ubyte(5).inputSourceByChunk(2).equal([[7, 7], [7, 7], [7]]));
    assert (7.repeat!ubyte(5).inputSourceByChunk(3).equal([[7, 7, 7], [7, 7]]));
    assert (7.repeat!ubyte(5).inputSourceByChunk(4).equal([[7, 7, 7, 7], [7]]));
    assert (7.repeat!ubyte(5).inputSourceByChunk(5).equal([[7, 7, 7, 7, 7]]));
    assert (7.repeat!ubyte(5).inputSourceByChunk(6).equal([[7, 7, 7, 7, 7]]));

    /* Infinite. */
    assert (7.repeat!ubyte.inputSourceByChunk(2).take(3).equal([[7, 7], [7, 7], [7, 7]]));
}