/**
Command line tool that prints TSV data aligned for easier reading on consoles
and traditional command-line environments.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_pretty;

import std.range;
import std.stdio;
import std.typecons : Flag, Yes, No, tuple;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-pretty to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns the exit code supplied by processArgs when it signals that execution
     * should stop, 1 if tsvPretty throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvPrettyOptions options;
        auto r = options.processArgs(cmdArgs);

        /* r[0] is false when processing should stop; r[1] is then the exit code. */
        if (!r[0]) return r[1];

        /* The first entry of cmdArgs is skipped; the rest are passed on as input files. */
        try tsvPretty(options, cmdArgs[1 .. $]);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Full help text, printed for --help-verbose. The option list is appended by getopt. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a format intended to be more human readable when
working on the command line. This is done primarily by lining up data into
fixed-width columns. Text is left aligned, numbers are right aligned. Floating
point numbers are aligned on the decimal point when feasible.

Processing begins by reading the initial set of lines into memory to determine
the field widths and data types of each column. This look-ahead buffer is used
for header detection as well. Output begins after this processing is complete.

By default, only the alignment is changed, the actual values are not modified.
Several of the formatting options do modify the values.

Features:

* Floating point numbers: Floats can be printed in fixed-width precision, using
  the same precision for all floats in a column. This makes them line up nicely.
  Precision is determined by values seen during look-ahead processing. The max
  precision defaults to 9, this can be changed when smaller or larger values are
  desired. See the '--f|format-floats' and '--p|precision' options.

* Header lines: Headers are detected automatically when possible. This can be
  overridden when automatic detection doesn't work as desired. Headers can be
  underlined and repeated at regular intervals.

* Missing values: A substitute value can be used for empty fields. This is often
  less confusing than spaces. See '--e|replace-empty' and '--E|empty-replacement'.

* Exponential notation: As part of float formatting, '--f|format-floats' re-formats
  columns where exponential notation is found so all the values in the column
  are displayed using exponential notation with the same precision.

* Preamble: A number of initial lines can be designated as a preamble and output
  unchanged. The preamble is before the header, if a header is present.

* Fonts: Fixed-width fonts are assumed. CJK characters are assumed to be double
  width. This is not always correct, but works well in most cases.

Options:
EOS";

/* Short help text, printed for --help. The option list is appended by getopt. */
auto helpText = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a more human readable format. This is done by lining
up data into fixed-width columns. Text is left aligned, numbers are right aligned.
Floating point numbers are aligned on the decimal point when feasible.

Options:
EOS";

/** TsvPrettyOptions is used to process and store command line options.
*/
struct TsvPrettyOptions
{
    string programName;
    bool helpVerbose = false;               // --help-verbose
    bool hasHeader = false;                 // --H|header (Note: Default false assumed by validation code)
    bool autoDetectHeader = true;           // Derived (Note: Default true assumed by validation code)
    bool noHeader = false;                  // --x|no-header (Note: Default false assumed by validation code)
    size_t lookahead = 1000;                // --l|lookahead
    size_t repeatHeader = 0;                // --r|repeat-header num (zero means no repeat)
    bool underlineHeader = false;           // --u|underline-header
    bool formatFloats = false;              // --f|format-floats
    size_t floatPrecision = 9;              // --p|precision num (max precision when formatting floats.)
    bool replaceEmpty = false;              // --e|replace-empty
    string emptyReplacement = "";           // --E|empty-replacement
    size_t emptyReplacementPrintWidth = 0;  // Derived
    char delim = '\t';                      // --d|delimiter
    size_t spaceBetweenFields = 2;          // --s|space-between-fields num
    size_t maxFieldPrintWidth = 40;         // --m|max-text-width num; Max width for variable width text fields.
    size_t preambleLines = 0;               // --a|preamble; Number of preamble lines.
    bool versionWanted = false;             // --V|version

    /* Parses the command line into this struct, then validates it and fills in the
     * derived members (autoDetectHeader, replaceEmpty/emptyReplacement and
     * emptyReplacementPrintWidth).
     *
     * Returns a tuple. The first value is true when execution should continue. It is
     * false when help or version output was requested (second value 0) or when an
     * Exception was raised while parsing or validating (second value 1; the message
     * is written to stderr).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto parsed = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "x|no-header", " Assume no header. Turns off automatic header detection.", &noHeader,
                "l|lookahead", "NUM Lines to read to interpret data before generating output. Default: 1000", &lookahead,

                "r|repeat-header", "NUM Lines to print before repeating the header. Default: No repeating header", &repeatHeader,

                "u|underline-header", " Underline the header.", &underlineHeader,
                "f|format-floats", " Format floats for better readability. Default: No", &formatFloats,
                "p|precision", "NUM Max floating point precision. Implies --format-floats. Default: 9", &floatPrecisionOptionHandler,
                std.getopt.config.caseSensitive,
                "e|replace-empty", " Replace empty fields with '--'.", &replaceEmpty,
                "E|empty-replacement", "STR Replace empty fields with a string.", &emptyReplacement,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "s|space-between-fields", "NUM Spaces between each field (Default: 2)", &spaceBetweenFields,
                "m|max-text-width", "NUM Max reserved field width for variable width text fields. Default: 40", &maxFieldPrintWidth,
                "a|preamble", "NUM Treat the first NUM lines as a preamble and output them unchanged.", &preambleLines,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests. Each one prints and stops with exit code zero. */
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }
            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }
            if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-pretty"));
                return tuple(false, 0);
            }

            /* Validation and derivations. */
            if (hasHeader && noHeader)
            {
                throw new Exception("Cannot specify both --H|header and --x|no-header.");
            }

            /* An explicit header choice disables automatic detection. */
            if (hasHeader || noHeader) autoDetectHeader = false;

            /* Zero look-ahead has limited utility unless the first line is known to
             * be a header. Good chance the user will get an unintended behavior.
             */
            if (autoDetectHeader && lookahead == 0)
            {
                assert (!noHeader && !hasHeader);
                throw new Exception("Cannot auto-detect header with zero look-ahead. Specify either '--H|header' or '--x|no-header' when using '--l|lookahead 0'.");
            }

            /* --replace-empty without an explicit replacement string uses "--". A
             * non-empty replacement string turns replacement on.
             */
            if (replaceEmpty && emptyReplacement.length == 0) emptyReplacement = "--";
            if (emptyReplacement.length != 0)
            {
                replaceEmpty = true;
                emptyReplacementPrintWidth = emptyReplacement.monospacePrintWidth;
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }

    /* Option handler for --p|precision. Stores the precision and also turns on
     * --f|format-floats.
     */
    private void floatPrecisionOptionHandler(string option, string optionVal) @safe pure
    {
        import std.conv : to;

        formatFloats = true;
        floatPrecision = optionVal.to!size_t;
    }
}

/** tsvPretty is the main loop, operating on input files and passing control to a
 * TsvPrettyProcessor instance.
 *
 * Each input line is routed by its one-based line number within its file: lines
 * before the first non-preamble line go to processPreambleLine, that line goes to
 * processFileFirstLine, and all later lines go to processLine. A file name of "-"
 * reads standard input, as does an empty file list.
 *
 * A new output range wrapping stdout.lockingTextWriter is created for every call
 * into the processor. Per the original author's note, this has the effect of
 * flushing standard output every line, which is desirable in command line tools.
 */
void tsvPretty(in ref TsvPrettyOptions options, string[] files)
{
    auto headerLineNum = options.preambleLines + 1;
    auto processor = TsvPrettyProcessor(options);
    auto inputFiles = (files.length > 0) ? files : ["-"];

    foreach (filename; inputFiles)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum > headerLineNum)
            {
                processor.processLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line);
            }
            else if (lineNum == headerLineNum)
            {
                processor.processFileFirstLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line);
            }
            else
            {
                processor.processPreambleLine(outputRangeObject!(char, char[])(stdout.lockingTextWriter), line);
            }
        }
    }
    processor.finish(outputRangeObject!(char, char[])(stdout.lockingTextWriter));
}

/** TsvPrettyProcessor maintains state of processing and exposes operations for
 * processing individual input lines.
 *
 * TsvPrettyProcessor knows that input is file-based, but doesn't deal with actual
 * files or reading lines from input. That is the job of the caller. Output is
 * written to an output range. The caller is expected to pass each line in the
 * order received; that is an assumption built into its processing.
*
 * In addition to the constructor, there are four API methods:
 *   - processPreambleLine - Called to process a preamble line occurring before
 *     the header line or first line of data.
 *   - processFileFirstLine - Called to process the first line of each file. This
 *     enables header processing.
 *   - processLine - Called to process all lines except for the first line of a file.
 *   - finish - Called at the end of all processing. This is needed in case the
 *     look-ahead cache is still being filled when input terminates.
 */

struct TsvPrettyProcessor
{
    import std.array : appender;

private:
    private enum AutoDetectHeaderResult { none, hasHeader, noHeader };

    private TsvPrettyOptions _options;
    private size_t _fileCount = 0;
    private size_t _dataLineOutputCount = 0;
    private bool _stillCaching = true;
    private string _candidateHeaderLine;
    private auto _lookaheadCache = appender!(string[])();
    private FieldFormat[] _fieldVector;
    private AutoDetectHeaderResult _autoDetectHeaderResult = AutoDetectHeaderResult.none;

    /** Constructor. Caching is skipped entirely when there is no header and the
     * look-ahead is zero.
     */
    this(const TsvPrettyOptions options) @safe pure nothrow @nogc
    {
        _options = options;
        _stillCaching = !(options.noHeader && options.lookahead == 0);
    }

    invariant
    {
        assert(_options.hasHeader || _options.noHeader || _options.autoDetectHeader);
        assert((_options.lookahead == 0 && _lookaheadCache.data.length == 0) ||
               _lookaheadCache.data.length < _options.lookahead);
    }

    /** Handles a preamble line, one occurring before the header line or first line
     * of data. It is written out unchanged while no file's first line has been
     * processed yet; after that, preamble lines are dropped.
     */
    void processPreambleLine(OutputRange!char outputStream, const char[] line)
    {
        if (_fileCount != 0) return;

        put(outputStream, line);
        put(outputStream, '\n');
    }

    /** Handles the first non-preamble line of each file. This is where header
     * processing happens.
     */
    void processFileFirstLine(OutputRange!char outputStream, const char[] line)
    {
        import std.conv : to;

        _fileCount++;

        if (_options.noHeader)
        {
            /* No header: the line is ordinary data. */
            processLine(outputStream, line);
            return;
        }

        if (_options.hasHeader)
        {
            /* Only the first file's header is used; later headers are dropped. */
            if (_fileCount == 1)
            {
                setHeaderLine(line);
                if (_options.lookahead == 0) outputLookaheadCache(outputStream);
            }
            return;
        }

        assert(_options.autoDetectHeader);

        final switch (_autoDetectHeaderResult)
        {
        case AutoDetectHeaderResult.noHeader:
            assert(_fileCount > 1);
            processLine(outputStream, line);
            break;

        case AutoDetectHeaderResult.hasHeader:
            assert(_fileCount > 1);
            break;

        case AutoDetectHeaderResult.none:
            if (_fileCount == 1)
            {
                /* Hold the line until there is enough data to decide. */
                assert(_candidateHeaderLine.length == 0);
                _candidateHeaderLine = line.to!string;
                break;
            }

            if (_fileCount != 2) break;

            if (_candidateHeaderLine != line)
            {
                /* The first lines of the two files differ: treat both as data. */
                _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader;
                updateFieldFormatsForLine(_candidateHeaderLine);
                processLine(outputStream, line);
                break;
            }

            /* Identical first lines in the first two files: it is a header. */
            _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader;
            setHeaderLine(_candidateHeaderLine);

            /* Edge case: First file has only a header line and look-ahead set to zero. */
            if (_stillCaching && _options.lookahead == 0) outputLookaheadCache(outputStream);
            break;
        }
    }

    /** Handles every line other than the first line of a file. The line is cached
     * while look-ahead is in progress and written out directly afterwards.
     */
    void processLine(OutputRange!char outputStream, const char[] line)
    {
        if (!_stillCaching)
        {
            outputDataLine(outputStream, line);
            return;
        }
        cacheDataLine(outputStream, line);
    }

    /** Called once all input has been passed in. Writes out the look-ahead cache
     * if input ended while it was still being filled.
     */
    void finish(OutputRange!char outputStream)
    {
        if (_stillCaching) outputLookaheadCache(outputStream);
    }

private:
    /* outputLookaheadCache ends the caching phase. It settles the auto-detect header
     * decision, finalizes the type and width of each field, writes the header (or
     * the candidate header as data), and then writes every cached line.
     */
    void outputLookaheadCache(OutputRange!char outputStream)
    {
        assert(_stillCaching);

        immutable bool decisionPending =
            _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.none &&
            _candidateHeaderLine.length != 0;

        if (decisionPending)
        {
            if (candidateHeaderLooksLikeHeader())
            {
                _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader;
                setHeaderLine(_candidateHeaderLine);
            }
            else
            {
                _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader;
            }
        }

        immutable bool headerPresent =
            _options.hasHeader ||
            (_options.autoDetectHeader && _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader);

        immutable bool candidateIsData =
            !headerPresent &&
            _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.noHeader &&
            _candidateHeaderLine.length != 0;

        /* A rejected candidate header is data, so it must contribute to the formats
         * before they are finalized.
         */
        if (candidateIsData) updateFieldFormatsForLine(_candidateHeaderLine);
        finalizeFieldFormatting();

        if (headerPresent) outputHeader(outputStream);
        else if (candidateIsData) outputDataLine(outputStream, _candidateHeaderLine);

        foreach (cachedLine; _lookaheadCache.data) outputDataLine(outputStream, cachedLine);
        _lookaheadCache.clear;
        _stillCaching = false;
    }

    /* Decides whether the candidate header is a header: true when at least one of
     * its fields is text while the same column, as seen so far, is numeric.
     */
    bool candidateHeaderLooksLikeHeader() @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, fieldValue; _candidateHeaderLine.splitter(_options.delim).enumerate)
        {
            auto candidateFormat = FieldFormat(fieldIndex);
            candidateFormat.updateForFieldValue(fieldValue, _options);

            if (_fieldVector.length <= fieldIndex) continue;
            if (candidateFormat.fieldType != FieldType.text) continue;

            immutable columnType = _fieldVector[fieldIndex].fieldType;
            if (columnType == FieldType.integer ||
                columnType == FieldType.floatingPoint ||
                columnType == FieldType.exponent)
            {
                return true;
            }
        }

        return false;
    }

    /* Records the header text for each field of the line, adding field entries as
     * needed.
     */
    void setHeaderLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, headerText; line.splitter(_options.delim).enumerate)
        {
            if (fieldIndex == _fieldVector.length) _fieldVector ~= FieldFormat(fieldIndex);
            assert(_fieldVector.length > fieldIndex);
            _fieldVector[fieldIndex].setHeader(headerText);
        }
    }

    /* Adds a line to the look-ahead cache and updates the field formats from it.
     * Once the cache holds 'lookahead' lines it is written out.
     */
    void cacheDataLine(OutputRange!char outputStream, const char[] line)
    {
        import std.conv : to;

        assert(_lookaheadCache.data.length < _options.lookahead);

        _lookaheadCache ~= line.to!string;
        updateFieldFormatsForLine(line);

        immutable bool cacheFull = (_lookaheadCache.data.length == _options.lookahead);
        if (cacheFull) outputLookaheadCache(outputStream);
    }

    /* Feeds each field value of the line to its column's FieldFormat, adding field
     * entries as needed.
     */
    void updateFieldFormatsForLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, fieldValue; line.splitter(_options.delim).enumerate)
        {
            if (fieldIndex == _fieldVector.length) _fieldVector ~= FieldFormat(fieldIndex);
            assert(_fieldVector.length > fieldIndex);
            _fieldVector[fieldIndex].updateForFieldValue(fieldValue, _options);
        }
    }

    /* Finalizes every field in order. Each field starts where the previous one
     * ended plus the configured space between fields.
     */
    void finalizeFieldFormatting() @safe pure @nogc nothrow
    {
        size_t fieldStart = 0;
        foreach (ref field; _fieldVector)
        {
            immutable size_t fieldEnd = field.finalizeFormatting(fieldStart, _options);
            fieldStart = fieldEnd + _options.spaceBetweenFields;
        }
    }

    /* Writes the header line, followed by an underline line when requested. */
    void outputHeader(OutputRange!char outputStream)
    {
        size_t outputPosition = 0;
        foreach (ref field; _fieldVector)
        {
            immutable size_t padding = field.startPosition - outputPosition;
            put(outputStream, repeat(" ", padding));
            outputPosition += padding;
            outputPosition += field.writeHeader(outputStream, _options);
        }
        put(outputStream, '\n');

        if (!_options.underlineHeader) return;

        outputPosition = 0;
        foreach (ref field; _fieldVector)
        {
            immutable size_t padding = field.startPosition - outputPosition;
            put(outputStream, repeat(" ", padding));
            outputPosition += padding;
            outputPosition += field.writeHeader!(Yes.writeUnderline)(outputStream, _options);
        }
        put(outputStream, '\n');
    }

    /* Writes one data line, aligning each field to its column. The header is
     * repeated first when the repeat-header option says it is due.
     */
    void outputDataLine(OutputRange!char outputStream, const char[] line)
    {
        import std.algorithm : splitter;

        /* Repeating header option. */
        immutable bool headerPresent =
            _options.hasHeader ||
            (_options.autoDetectHeader && _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader);

        if (_options.repeatHeader != 0 && _dataLineOutputCount != 0 && headerPresent &&
            _dataLineOutputCount % _options.repeatHeader == 0)
        {
            put(outputStream, '\n');
            outputHeader(outputStream);
        }

        _dataLineOutputCount++;

        size_t outputPosition = 0;
        foreach (fieldIndex, fieldValue; line.splitter(_options.delim).enumerate)
        {
            if (fieldIndex == _fieldVector.length)
            {
                /* Line is longer than any seen while caching. Add a new FieldFormat entry
                 * and set the line formatting based on this field value.
                 */
                size_t newFieldStart = 0;
                if (fieldIndex != 0)
                {
                    newFieldStart = _fieldVector[fieldIndex - 1].endPosition + _options.spaceBetweenFields;
                }

                auto newField = FieldFormat(fieldIndex);
                newField.updateForFieldValue(fieldValue, _options);
                newField.finalizeFormatting(newFieldStart, _options);
                _fieldVector ~= newField;
            }

            assert(fieldIndex < _fieldVector.length);

            FieldFormat fieldFormat = _fieldVector[fieldIndex];
            immutable size_t fieldStart = fieldFormat.startPosition;

            size_t padding;
            if (outputPosition < fieldStart)
            {
                padding = fieldStart - outputPosition;
            }
            else
            {
                /* Previous field went long. One space between fields. */
                padding = (fieldIndex == 0) ? 0 : 1;
            }

            put(outputStream, repeat(" ", padding));
            outputPosition += padding;
            outputPosition += fieldFormat.writeFieldValue(outputStream, outputPosition, fieldValue, _options);
        }
        put(outputStream, '\n');
    }
}

/** Field types recognized and tracked by tsv-pretty processing. */
enum FieldType { unknown, text, integer, floatingPoint, exponent };

/** Field alignments used by tsv-pretty processing. */
enum FieldAlignment { left, right };

/** FieldFormat holds all the formatting info needed to format data values in a specific
 * column. e.g. Field 1 may be text, field 2 may be a float, etc. This is calculated
 * during the caching phase. Each FieldFormat instance is part of a vector representing
 * the full row, so each includes the start position on the line and similar data.
 *
 * APIs used during the caching phase to gather field value samples
 *   - this - Initial construction. Takes the field index.
 *   - setHeader - Used to set the header text.
 *   - updateForFieldValue - Used to add the next field value sample.
 *   - finalizeFormatting - Used at the end of caching to finalize the format choices.
 *
 * APIs used after caching is finished (after finalizeFormatting):
 *   - startPosition - Returns the expected start position for the field.
*   - endPosition - Returns the expected end position for the field.
 *   - writeHeader - Outputs the header, properly aligned.
 *   - writeFieldValue - Outputs the current field value, properly aligned.
 */

struct FieldFormat
{
private:
    size_t _fieldIndex;                  // Zero-based index in the line
    string _header = "";                 // Original field header
    size_t _headerPrintWidth = 0;
    FieldType _type = FieldType.unknown;
    FieldAlignment _alignment = FieldAlignment.left;
    size_t _startPosition = 0;
    size_t _printWidth = 0;
    size_t _precision = 0;               // Number of digits after the decimal point

    /* These are used while doing initial type and print format detection. */
    size_t _minRawPrintWidth = 0;
    size_t _maxRawPrintWidth = 0;
    size_t _maxDigitsBeforeDecimal = 0;
    size_t _maxDigitsAfterDecimal = 0;
    size_t _maxSignificantDigits = 0;    // Digits to include in exponential notation

public:

    /** Initial construction. Takes a field index. */
    this(size_t fieldIndex) @safe pure nothrow @nogc
    {
        _fieldIndex = fieldIndex;
    }

    /** Stores the header text and its print width. */
    void setHeader(const char[] header) @safe
    {
        import std.conv : to;

        _header = header.to!string;
        _headerPrintWidth = _header.monospacePrintWidth;
    }

    /** Returns the expected start position for the field. */
    size_t startPosition() nothrow pure @safe @property
    {
        return _startPosition;
    }

    /** Returns the expected end position for the field: start position plus print width. */
    size_t endPosition() nothrow pure @safe @property
    {
        return _startPosition + _printWidth;
    }

    /** Returns the type of field. */
    FieldType fieldType() nothrow pure @safe @property
    {
        return _type;
    }

    /** Writes the field header or underline characters to the output stream.
     *
     * The caller is expected to have written output up to the field's start position,
     * including any spaces between fields. Unlike data fields, there is no correction
     * for previous fields that have run long. No trailing spaces are written, which
     * keeps lines free of unnecessary trailing whitespace.
     *
     * Underlines can be written either the full width of the field or just beneath
     * the header text. The choice is a template (compile-time) parameter.
     *
     * Nothing is written for a field with an empty header. The print width of the
     * output is returned.
     */
    size_t writeHeader (Flag!"writeUnderline" writeUnderline = No.writeUnderline,
                        Flag!"fullWidthUnderline" fullWidthUnderline = No.fullWidthUnderline)
        (OutputRange!char outputStream, in ref TsvPrettyOptions options)
    {
        import std.range : repeat;

        if (_headerPrintWidth == 0) return 0;

        static if (writeUnderline && fullWidthUnderline)
        {
            put(outputStream, repeat("-", _printWidth));
            return _printWidth;
        }
        else
        {
            /* Header text, or an underline beneath the header text only. Right aligned
             * fields are padded on the left so the text ends at the column edge.
             */
            size_t positionsWritten = 0;
            if (_alignment == FieldAlignment.right)
            {
                immutable size_t padding = _printWidth - _headerPrintWidth;
                put(outputStream, repeat(" ", padding));
                positionsWritten += padding;
            }

            static if (writeUnderline) put(outputStream, repeat("-", _headerPrintWidth));
            else put(outputStream, _header);

            positionsWritten += _headerPrintWidth;
            return positionsWritten;
        }
    }

    /** Writes the field value for the current column.
     *
     * The caller must already have written output at least up to the column's start
     * position; it may be further along if previous fields have run long.
     *
     * Text is left aligned and numeric values are right aligned. Floating point
     * values are additionally aligned on the decimal point. An empty value is
     * replaced by the configured replacement string when that option is on.
     *
     * Returns the print width of what was written: the leading spaces plus the
     * print width of the value. Trailing spaces are not added; the caller must add
     * any needed before printing the next field.
     */
    size_t writeFieldValue(OutputRange!char outputStream, size_t currPosition,
                           const char[] fieldValue, in ref TsvPrettyOptions options)
    in
    {
        assert(currPosition >= _startPosition);   // Caller resposible for advancing to field start position.
        assert(_type == FieldType.text || _type == FieldType.integer ||
               _type == FieldType.floatingPoint || _type == FieldType.exponent);
    }
    body
    {
        import std.algorithm : find, max;
        import std.conv : to;

        /* Choose the text to print: the raw value, or a re-formatted float. */
        immutable bool printRawValue =
            !options.formatFloats || _type == FieldType.text || _type == FieldType.integer;

        string printValue;
        if (printRawValue)
        {
            printValue = fieldValue.to!string;
        }
        else if (_type == FieldType.exponent)
        {
            printValue = fieldValue.formatExponentValue(_precision);
        }
        else
        {
            assert(_type == FieldType.floatingPoint);
            printValue = fieldValue.formatFloatingPointValue(_precision);
        }

        if (options.replaceEmpty && printValue.length == 0) printValue = options.emptyReplacement;
        immutable size_t valueWidth = printValue.monospacePrintWidth;

        /* Calculate leading spaces needed for right alignment. */
        size_t leadingSpaces = 0;
        if (_alignment == FieldAlignment.right)
        {
            /* The target width is the column width reduced by any overrun from the
             * previous field, but never less than the value itself.
             */
            size_t targetWidth = _printWidth;
            if (currPosition != _startPosition)
            {
                immutable size_t overrun = currPosition - _startPosition;
                immutable size_t remainingWidth = (overrun < _printWidth) ? _printWidth - overrun : 0;
                targetWidth = max(valueWidth, remainingWidth);
            }

            if (valueWidth < targetWidth) leadingSpaces = targetWidth - valueWidth;

            /* The above calculation assumes the print value is fully right aligned.
             * That is not right for raw (unformatted) floats, since values in the
             * column differ in precision. Drop leading spaces as needed so the
             * decimal points line up. Text and exponential values stay aligned
             * strictly against the right boundary.
             */
            if (leadingSpaces > 0 && _precision > 0 &&
                _type == FieldType.floatingPoint && !options.formatFloats)
            {
                import std.algorithm : canFind;
                import std.string : isNumeric;

                if (printValue.isNumeric && !printValue.canFind!(x => x == 'e' || x == 'E'))
                {
                    /* Length of the decimal point plus the digits after it (zero if
                     * there is no decimal point).
                     */
                    immutable size_t fromDecimalLength = printValue.find(".").length;

                    size_t trailingSpaces = 0;
                    if (fromDecimalLength == 0)
                    {
                        trailingSpaces = _precision + 1;
                    }
                    else if (fromDecimalLength <= _precision)
                    {
                        trailingSpaces = _precision + 1 - fromDecimalLength;
                    }

                    leadingSpaces = (leadingSpaces > trailingSpaces) ? leadingSpaces - trailingSpaces : 0;
                }
            }
        }

        put(outputStream, repeat(' ', leadingSpaces));
        put(outputStream, printValue);
        return valueWidth + leadingSpaces;
    }

    /** Updates type and format given a new field value.
     *
     * This is called during look-ahead caching to register a new sample value for the
     * column. The key components updates are field type and print width.
*/
    void updateForFieldValue(const char[] fieldValue, in ref TsvPrettyOptions options) @safe
    {
        import std.algorithm : findAmong, findSplit, max, min;
        import std.conv : to, ConvException;
        import std.string : isNumeric;

        /* Raw print widths. An empty field is measured as the replacement text
         * when empty-field replacement is turned on.
         */
        size_t fieldValuePrintWidth = fieldValue.monospacePrintWidth;
        size_t fieldValuePrintWidthWithEmpty =
            (fieldValuePrintWidth == 0 && options.replaceEmpty) ?
            options.emptyReplacementPrintWidth :
            fieldValuePrintWidth;

        _maxRawPrintWidth = max(_maxRawPrintWidth, fieldValuePrintWidthWithEmpty);

        /* A stored minimum of zero is treated as "no minimum recorded yet". */
        _minRawPrintWidth = (_minRawPrintWidth == 0) ?
            fieldValuePrintWidthWithEmpty :
            min(_minRawPrintWidth, fieldValuePrintWidthWithEmpty);

        if (_type == FieldType.text)
        {
            /* Already text, can't become anything else. */
        }
        else if (fieldValuePrintWidth == 0)
        {
            /* Don't let an empty field override a numeric field type. */
        }
        else if (!fieldValue.isNumeric)
        {
            /* Not parsable as a number. Switch from unknown or numeric type to text. */
            _type = FieldType.text;
        }
        else
        {
            /* Field type is currently unknown or numeric, and current field parses as numeric.
             * See if it parses as integer or float. Integers will parse as floats, so try
             * integer types first. Only success or failure of each conversion matters, the
             * converted values themselves are not used.
             */
            FieldType parsesAs = FieldType.unknown;
            try
            {
                cast(void) fieldValue.to!long;
                parsesAs = FieldType.integer;
            }
            catch (ConvException)
            {
                try
                {
                    cast(void) fieldValue.to!ulong;
                    parsesAs = FieldType.integer;
                }
                catch (ConvException)
                {
                    try
                    {
                        cast(void) fieldValue.to!double;
                        parsesAs = (fieldValue.findAmong("eE").length == 0) ?
                            FieldType.floatingPoint : FieldType.exponent;
                    }
                    catch (ConvException)
                    {
                        /* Note: This means isNumeric thinks it's a number, but conversions all failed. */
                        parsesAs = FieldType.text;
                    }
                }
            }

            /* Numeric types are ordered integer < floatingPoint < exponent. A new value
             * may promote the column type but must never demote it, otherwise the final
             * type would depend on the order in which values are seen.
             */
            if (parsesAs == FieldType.text)
            {
                /* Not parsable as a number (despite isNumeric result). Switch to text type. */
                _type = FieldType.text;
            }
            else if (parsesAs == FieldType.exponent)
            {
                /* Exponential notation supersedes both vanilla floats and integers. */
                _type = FieldType.exponent;
                _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);

                if (auto decimalSplit = fieldValue.findSplit("."))
                {
                    /* Digits after the decimal point stop where the exponent starts. */
                    auto fromExponent = decimalSplit[2].findAmong("eE");
                    size_t numDigitsAfterDecimal = decimalSplit[2].length - fromExponent.length;
                    _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length);
                    _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, numDigitsAfterDecimal);
                }
                else
                {
                    /* Exponent without a decimal point. */
                    auto fromExponent = fieldValue.findAmong("eE");
                    assert(fromExponent.length > 0);
                    size_t numDigits = fieldValue.length - fromExponent.length;
                    _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, numDigits);
                }
            }
            else if (parsesAs == FieldType.floatingPoint)
            {
                /* Floating point supersedes integer but not exponential. */
                if (_type != FieldType.exponent) _type = FieldType.floatingPoint;
                _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);

                if (auto decimalSplit = fieldValue.findSplit("."))
                {
                    _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length);
                    _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, decimalSplit[2].length);
                }
            }
            else
            {
                assert(parsesAs == FieldType.integer);

                /* An integer only sets the type when nothing "wider" has been seen. Both
                 * floatingPoint and exponent must be preserved here. Previously only
                 * floatingPoint was checked, so an integer following an exponent value
                 * silently reset the column to integer.
                 */
                if (_type != FieldType.floatingPoint && _type != FieldType.exponent)
                {
                    _type = FieldType.integer;
                }
                _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);
                _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, fieldValue.length);
            }
        }
    }

    /** Updates field formatting info based on the current state. It is expected to be
     * called after adding field entries via updateForFieldValue().
     *
     * The call records the start position, resolves an unknown field type to text,
     * chooses the alignment (right for integer, floating point and exponent fields,
     * left otherwise), and computes the print width and precision for the field type.
     *
     * Params:
     *   startPosition = Position at which this field starts.
     *   options = Program options. floatPrecision, formatFloats and maxFieldPrintWidth
     *             are read.
     *
     * Returns: The new end position, startPosition plus the computed print width.
     */
    size_t finalizeFormatting (size_t startPosition, in ref TsvPrettyOptions options) @safe pure @nogc nothrow
    {
        import std.algorithm : max, min;
        _startPosition = startPosition;
        if (_type == FieldType.unknown) _type = FieldType.text;
        _alignment = (_type == FieldType.integer || _type == FieldType.floatingPoint
                      || _type == FieldType.exponent) ?
            FieldAlignment.right :
            FieldAlignment.left;

        if (_type == FieldType.floatingPoint)
        {
            /* Precision is capped by both the option and the digits actually seen. */
            size_t precision = min(options.floatPrecision, _maxDigitsAfterDecimal);
            size_t maxValueWidth = _maxDigitsBeforeDecimal + precision;
            if (precision > 0) maxValueWidth++;  // Account for the decimal point.
            _printWidth = max(1, _headerPrintWidth, maxValueWidth);
            _precision = precision;
        }
        else if (_type == FieldType.exponent)
        {
            /* One significant digit sits before the decimal point, the rest are precision. */
            size_t maxPrecision = (_maxSignificantDigits > 0) ? _maxSignificantDigits - 1 : 0;
            _precision = min(options.floatPrecision, maxPrecision);

            /* NOTE(review): the 7 presumably covers sign, leading digit, decimal point
             * and a four character exponent such as "e+06". Confirm.
             */
            size_t maxValuePrintWidth = !options.formatFloats ? _maxRawPrintWidth : _precision + 7;
            _printWidth = max(1, _headerPrintWidth, maxValuePrintWidth);
        }
        else if (_type == FieldType.integer)
        {
            _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth, _maxRawPrintWidth);
            _precision = 0;
        }
        else
        {
            /* Text: the widest value is capped by maxFieldPrintWidth, but the field is
             * never narrower than the header or the narrowest value seen.
             */
            _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth,
                              min(options.maxFieldPrintWidth, _maxRawPrintWidth));
            _precision = 0;
        }

        return _startPosition + _printWidth;
    }
}

/** formatFloatingPointValue returns the printed representation of a raw value
 * formatted as a fixed precision floating number. This includes zero padding or
 * truncation of trailing digits as necessary to meet the desired precision.
 *
 * If the value cannot be interpreted as a double then the raw value is returned.
 * Similarly, values in exponential notation and non-finite values (NaN, infinity)
 * are returned without reformatting.
 *
 * This routine is used to format values in columns identified as floating point.
 *
 * Params:
 *   value = Raw field text.
 *   precision = Number of digits to print after the decimal point.
 *
 * Returns: The formatted value, or a copy of the raw value if it was not reformatted.
 */
string formatFloatingPointValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    string printValue;

    if (value.canFind!(x => x == 'e' || x == 'E'))
    {
        /* Exponential notation. Use the raw value. */
        printValue = value.to!string;
    }
    else
    {
        try
        {
            double doubleValue = value.to!double;
            if (doubleValue.isFinite)
            {
                size_t numPrecisionDigits = value.precisionDigits;
                if (numPrecisionDigits >= precision)
                {
                    /* Enough digits in the input, format() rounds to the precision. */
                    printValue = format("%.*f", precision, doubleValue);
                }
                else if (numPrecisionDigits == 0)
                {
                    /* No fractional digits. Add the decimal point and all the zeros. */
                    printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ "." ~ repeat("0", precision).join;
                }
                else
                {
                    /* Keep the digits present in the input and zero pad the remainder. */
                    printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ repeat("0", precision - numPrecisionDigits).join;
                }
            }
            else printValue = value.to!string;  // NaN or Infinity
        }
        catch (ConvException) printValue = value.to!string;
    }
    return printValue;
}

@safe unittest
{
    assert("".formatFloatingPointValue(3) == "");
    assert(" ".formatFloatingPointValue(3) == " ");
    assert("abc".formatFloatingPointValue(3) == "abc");
    assert("nan".formatFloatingPointValue(3) == "nan");
    assert("0".formatFloatingPointValue(0) == "0");
    assert("1".formatFloatingPointValue(0) == "1");
    assert("1.".formatFloatingPointValue(0) == "1");
    assert("1".formatFloatingPointValue(3) == "1.000");
    assert("1000".formatFloatingPointValue(3) == "1000.000");
    assert("1000.001".formatFloatingPointValue(5) == "1000.00100");
    assert("1000.001".formatFloatingPointValue(3) == "1000.001");
    assert("1000.001".formatFloatingPointValue(2) == "1000.00");
    assert("1000.006".formatFloatingPointValue(2) == "1000.01");
    assert("-0.1".formatFloatingPointValue(1) == "-0.1");
    assert("-0.1".formatFloatingPointValue(3) == "-0.100");
    assert("-0.001".formatFloatingPointValue(3) == "-0.001");
    assert("-0.006".formatFloatingPointValue(2) == "-0.01");
    assert("-0.001".formatFloatingPointValue(1) == "-0.0");
    assert("-0.001".formatFloatingPointValue(0) == "-0");
    assert("0e+00".formatFloatingPointValue(0) == "0e+00");
    assert("0.00e+00".formatFloatingPointValue(0) == "0.00e+00");
    assert("1e+06".formatFloatingPointValue(1) == "1e+06");
    assert("1e+06".formatFloatingPointValue(2) == "1e+06");
    assert("1E-06".formatFloatingPointValue(1) == "1E-06");
    assert("1.1E+6".formatFloatingPointValue(2) == "1.1E+6");
    assert("1.1E+100".formatFloatingPointValue(2) == "1.1E+100");
}

/** formatExponentValue
returns the text to print for a raw value when it is rendered in
 * exponential notation with the requested precision. Values that cannot be
 * converted to a double, as well as NaN and infinities, are not reformatted; a
 * copy of the original text is returned for them.
 *
 * This routine is used to format values in columns identified as having exponent format.
 *
 * Params:
 *   value = Raw field text.
 *   precision = Number of digits to print after the decimal point of the mantissa.
 *
 * Returns: The value in exponential notation, or a copy of the raw value.
 */
string formatExponentValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind, findSplit;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    try
    {
        immutable double number = value.to!double;

        /* NaN or Infinity: hand the text back as given. */
        if (!number.isFinite) return value.to!string;

        /* Digits the input carries after the leading mantissa digit. */
        immutable size_t sigDigits = value.significantDigits;
        immutable size_t nativeDigits = (sigDigits == 0) ? 0 : sigDigits - 1;

        /* The input has at least as many digits as requested, so format() can
         * produce the final text directly.
         */
        if (nativeDigits >= precision) return format("%.*e", precision, number);

        /* The input has fewer digits than requested. Format with the digits it
         * has, then zero pad the mantissa up to the requested precision.
         */
        string shortForm = format("%.*e", nativeDigits, number);
        auto pieces = shortForm.findSplit("e");  // Uses the same exponent case as format call.
        string mantissa = pieces[0];

        if (nativeDigits == 0)
        {
            /* A bare integer mantissa needs its decimal point added first. */
            assert(precision != 0);
            assert(!mantissa.canFind("."));
            mantissa ~= ".";
        }

        auto zeroFill = repeat("0", precision - nativeDigits).join;
        return mantissa ~ zeroFill ~ pieces[1] ~ pieces[2];
    }
    catch (ConvException)
    {
        return value.to!string;
    }
}

@safe unittest
{
    static struct Case
    {
        string input;
        size_t precision;
        string expected;
    }

    immutable Case[] cases = [
        Case("", 3, ""),
        Case(" ", 3, " "),
        Case("abc", 3, "abc"),
        Case("nan", 3, "nan"),
        Case("0", 0, "0e+00"),
        Case("1", 0, "1e+00"),
        Case("1.", 0, "1e+00"),
        Case("1", 3, "1.000e+00"),
        Case("1000", 3, "1.000e+03"),
        Case("1000.001", 5, "1.00000e+03"),
        Case("1000.001", 3, "1.000e+03"),
        Case("1000.001", 6, "1.000001e+03"),
        Case("1000.006", 5, "1.00001e+03"),
        Case("-0.1", 1, "-1.0e-01"),
        Case("-0.1", 3, "-1.000e-01"),
        Case("-0.001", 3, "-1.000e-03"),
        Case("-0.001", 1, "-1.0e-03"),
        Case("-0.001", 0, "-1e-03"),
        Case("0e+00", 0, "0e+00"),
        Case("0.00e+00", 0, "0e+00"),
        Case("1e+06", 1, "1.0e+06"),
        Case("1e+06", 2, "1.00e+06"),
        Case("1.0001e+06", 1, "1.0e+06"),
        Case("1.0001e+06", 5, "1.00010e+06"),
    ];

    foreach (c; cases)
    {
        assert(c.input.formatExponentValue(c.precision) == c.expected);
    }
}

/** Returns the number of significant digits in a numeric string.
 *
 * Significant digits are those needed to represent a number in exponential notation.
* Examples:
 *   22.345 - 5 digits
 *   10.010 - 4 digits
 *   0.0032 - 2 digits
 *   0.0100 - 1 digit
 *   0e+10  - 1 digit
 *
 * Leading zeros are never counted. Trailing zeros after the decimal point are
 * not counted, trailing zeros of a number written without a decimal point are.
 * Digits in the exponent are never counted. A value of zero has one significant
 * digit. Zero is returned for values that are not finite (NaN, infinity).
 *
 * The input is expected to satisfy std.string.isNumeric (checked by assert).
 */
size_t significantDigits(const char[] numericString) @safe pure
{
    import std.algorithm : canFind, find, findAmong, stripRight;
    import std.ascii : isDigit;
    import std.math : isFinite;
    import std.string : isNumeric;
    import std.conv : to;

    assert (numericString.isNumeric);

    size_t significantDigits = 0;
    if (numericString.to!double.isFinite)
    {
        /* Drop the exponent before looking at any digits. Searching for the first
         * non-zero digit in the full string would find a digit of the exponent when
         * the mantissa is zero (e.g. "0e+10").
         */
        auto exponentPart = numericString.findAmong("eE");
        auto mantissa = numericString[0 .. $ - exponentPart.length];

        /* Whether trailing zeros are fractional depends on the whole mantissa, not on
         * the part after the first non-zero digit: in "0.0100" the decimal point comes
         * before that digit, yet the trailing zeros still follow the decimal point.
         */
        immutable bool hasDecimalPoint = mantissa.canFind('.');

        /* Skip sign, leading zeros, and a leading decimal point. */
        auto digitsPart = mantissa.find!(x => x.isDigit && x != '0');

        /* Stripping stops at the decimal point, so integer digits are never removed. */
        if (hasDecimalPoint) digitsPart = digitsPart.stripRight('0');

        /* A decimal point still inside the remaining text is not a digit. */
        significantDigits = digitsPart.canFind('.') ? digitsPart.length - 1 : digitsPart.length;

        if (significantDigits == 0) significantDigits = 1;
    }

    return significantDigits;
}

@safe pure unittest
{
    assert("0".significantDigits == 1);
    assert("10".significantDigits == 2);
    assert("0.0".significantDigits == 1);
    assert("-10.0".significantDigits == 2);
    assert("-.01".significantDigits == 1);
    assert("-.5401".significantDigits == 4);
    assert("1010.010".significantDigits == 6);
    assert("0.0003003".significantDigits == 4);
    assert("6e+06".significantDigits == 1);
    assert("6.0e+06".significantDigits == 1);
    assert("6.5e+06".significantDigits == 2);
    assert("6.005e+06".significantDigits == 4);

    /* Zero mantissa: exponent digits must not be counted. */
    assert("0e+10".significantDigits == 1);
    assert("0.00e+12".significantDigits == 1);

    /* Trailing fractional zeros when the decimal point precedes the first non-zero digit. */
    assert("0.0100".significantDigits == 1);
    assert("0.00320".significantDigits == 2);
    assert("0.5e+10".significantDigits == 1);
}

/** Returns the number of digits to the right of the decimal point in a numeric string.
 * This routine includes trailing zeros in the count.
*/
size_t precisionDigits(const char[] numericString) @safe pure
{
    import std.algorithm : findAmong, findSplit;
    import std.math : isFinite;
    import std.string : isNumeric;
    import std.conv : to;

    /* The input is expected to satisfy std.string.isNumeric. */
    assert (numericString.isNumeric);

    /* Zero is returned for non-finite values and for values without a decimal point. */
    size_t precisionDigits = 0;
    if (numericString.to!double.isFinite)
    {
        if (auto decimalSplit = numericString.findSplit("."))
        {
            /* Everything after the decimal point, minus any exponent suffix. */
            auto exponentPart = decimalSplit[2].findAmong("eE");
            precisionDigits = decimalSplit[2].length - exponentPart.length;
        }
    }

    return precisionDigits;
}

@safe pure unittest
{
    assert("0".precisionDigits == 0);
    assert("10".precisionDigits == 0);
    assert("0.0".precisionDigits == 1);
    assert("-10.0".precisionDigits == 1);
    assert("-.01".precisionDigits == 2);
    assert("-.5401".precisionDigits == 4);
}

/** Calculates the expected print width of a string in monospace (fixed-width) fonts.
 *
 * Each grapheme counts as one column, except graphemes whose first code point is in
 * one of the East Asian wide or fullwidth ranges listed in the function, which count
 * as two columns. This is a heuristic, it does not consult the full Unicode East
 * Asian Width tables.
 *
 * If the string is not valid UTF-8 the length in bytes is returned instead.
 */
size_t monospacePrintWidth(const char[] str) @safe nothrow
{
    /* Code point ranges treated as double width. The U+3000 - U+9FFF range is the
     * one originally used by this routine. The others add Hangul, CJK radicals,
     * compatibility ideographs and forms, fullwidth forms, and the supplementary
     * ideographic planes, which are wide or fullwidth per Unicode UAX #11.
     */
    static bool isWide(dchar c) @safe pure nothrow @nogc
    {
        return (c >= 0x1100 && c <= 0x115F)      // Hangul Jamo leading consonants
            || (c >= 0x2E80 && c <= 0x2FFF)      // CJK and Kangxi radicals
            || (c >= 0x3000 && c <= 0x9FFF)      // CJK punctuation, kana, unified ideographs
            || (c >= 0xAC00 && c <= 0xD7A3)      // Hangul syllables
            || (c >= 0xF900 && c <= 0xFAFF)      // CJK compatibility ideographs
            || (c >= 0xFE30 && c <= 0xFE4F)      // CJK compatibility forms
            || (c >= 0xFF01 && c <= 0xFF60)      // Fullwidth ASCII variants
            || (c >= 0xFFE0 && c <= 0xFFE6)      // Fullwidth symbol variants
            || (c >= 0x20000 && c <= 0x3FFFD);   // Supplementary ideographic planes
    }

    import std.uni : byGrapheme;

    size_t width = 0;
    try foreach (g; str.byGrapheme) width += isWide(g[0]) ? 2 : 1;
    catch (Exception) width = str.length;  // Invalid utf-8 sequence. Catch avoids program failure.

    return width;
}

unittest
{
    assert("".monospacePrintWidth == 0);
    assert(" ".monospacePrintWidth == 1);
    assert("abc".monospacePrintWidth == 3);
    assert("林檎".monospacePrintWidth == 4);
    assert("æble".monospacePrintWidth == 4);
    assert("ვაშლი".monospacePrintWidth == 5);
    assert("größten".monospacePrintWidth == 7);

    /* Hangul syllables and fullwidth forms are double width. */
    assert("한국어".monospacePrintWidth == 6);
    assert("ＡＢＣ".monospacePrintWidth == 6);
    assert("a한b".monospacePrintWidth == 4);
}