/**
Command line tool that prints TSV data aligned for easier reading on consoles
and traditional command-line environments.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_pretty;

import std.range;
import std.stdio;
import std.typecons : Flag, Yes, No, tuple;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvPrettyOptions instance and, when argument
     * processing says execution should continue, runs tsvPretty on the remaining
     * arguments (everything after the program name).
     *
     * Returns: The exit code supplied by processArgs when it reports that execution
     * should stop; 1 if tsvPretty throws an Exception (the message is written to
     * stderr); 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvPrettyOptions options;
        auto r = options.processArgs(cmdArgs);   // r[0]: continue?, r[1]: exit code
        if (!r[0]) return r[1];
        try tsvPretty(options, cmdArgs[1 .. $]);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Long-form help, printed ahead of the option list for --help-verbose. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a format intended to be more human readable when
working on the command line. This is done primarily by lining up data into
fixed-width columns. Text is left aligned, numbers are right aligned. Floating
point numbers are aligned on the decimal point when feasible.

Processing begins by reading the initial set of lines into memory to determine
the field widths and data types of each column. This look-ahead buffer is used
for header detection as well. Output begins after this processing is complete.

By default, only the alignment is changed, the actual values are not modified.
Several of the formatting options do modify the values.

Features:

* Floating point numbers: Floats can be printed in fixed-width precision, using
  the same precision for all floats in a column. This makes them line up nicely.
  Precision is determined by values seen during look-ahead processing. The max
  precision defaults to 9, this can be changed when smaller or larger values are
  desired. See the '--f|format-floats' and '--p|precision' options.

* Header lines: Headers are detected automatically when possible. This can be
  overridden when automatic detection doesn't work as desired. Headers can be
  underlined and repeated at regular intervals.

* Missing values: A substitute value can be used for empty fields. This is often
  less confusing than spaces. See '--e|replace-empty' and '--E|empty-replacement'.

* Exponential notation: As part of float formatting, '--f|format-floats' re-formats
  columns where exponential notation is found so all the values in the column
  are displayed using exponential notation with the same precision.

* Preamble: A number of initial lines can be designated as a preamble and output
  unchanged. The preamble is before the header, if a header is present.

* Fonts: Fixed-width fonts are assumed. CJK characters are assumed to be double
  width. This is not always correct, but works well in most cases.

Options:
EOS";

/* Short-form help, printed ahead of the option list for --help. */
auto helpText = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a more human readable format. This is done by lining
up data into fixed-width columns. Text is left aligned, numbers are right aligned.
Floating point numbers are aligned on the decimal point when feasible.

Options:
EOS";

/* TsvPrettyOptions is used to process and store command line options.
*/
struct TsvPrettyOptions
{
    string programName;
    bool helpVerbose = false;                // --help-verbose
    bool hasHeader = false;                  // --H|header (validation relies on the false default)
    bool autoDetectHeader = true;            // Derived (validation relies on the true default)
    bool noHeader = false;                   // --x|no-header (validation relies on the false default)
    size_t lookahead = 1000;                 // --l|lookahead
    size_t repeatHeader = 0;                 // --r|repeat-header num (zero means no repeat)
    bool underlineHeader = false;            // --u|underline-header
    bool formatFloats = false;               // --f|format-floats
    size_t floatPrecision = 9;               // --p|precision num (max precision when formatting floats)
    bool replaceEmpty = false;               // --e|replace-empty
    string emptyReplacement = "";            // --E|empty-replacement
    size_t emptyReplacementPrintWidth = 0;   // Derived from emptyReplacement
    char delim = '\t';                       // --d|delimiter
    size_t spaceBetweenFields = 2;           // --s|space-between-fields num
    size_t maxFieldPrintWidth = 40;          // --m|max-text-width num; max width for variable width text fields
    size_t preambleLines = 0;                // --a|preamble; number of preamble lines
    bool versionWanted = false;              // --V|version

    /* Parses the command line into this struct, then validates the options and fills
     * in the derived members (autoDetectHeader, replaceEmpty/emptyReplacement and
     * emptyReplacementPrintWidth).
     *
     * Returns a two-element tuple. The first element is true when processing
     * succeeded and the program should carry on. It is false when help or version
     * output was requested (second element 0) or when an exception was raised while
     * parsing or validating (second element 1; the message goes to stderr).
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto parsed = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "x|no-header", " Assume no header. Turns off automatic header detection.", &noHeader,
                "l|lookahead", "NUM Lines to read to interpret data before generating output. Default: 1000", &lookahead,

                "r|repeat-header", "NUM Lines to print before repeating the header. Default: No repeating header", &repeatHeader,

                "u|underline-header", " Underline the header.", &underlineHeader,
                "f|format-floats", " Format floats for better readability. Default: No", &formatFloats,
                "p|precision", "NUM Max floating point precision. Implies --format-floats. Default: 9", &floatPrecisionOptionHandler,
                std.getopt.config.caseSensitive,
                "e|replace-empty", " Replace empty fields with '--'.", &replaceEmpty,
                "E|empty-replacement", "STR Replace empty fields with a string.", &emptyReplacement,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "s|space-between-fields", "NUM Spaces between each field (Default: 2)", &spaceBetweenFields,
                "m|max-text-width", "NUM Max reserved field width for variable width text fields. Default: 40", &maxFieldPrintWidth,
                "a|preamble", "NUM Treat the first NUM lines as a preamble and output them unchanged.", &preambleLines,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and stop. Plain --help wins over --help-verbose. */
            if (parsed.helpWanted || helpVerbose)
            {
                defaultGetoptPrinter(parsed.helpWanted ? helpText : helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }
            if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-pretty"));
                return tuple(false, 0);
            }

            /* Validation and derivations. */
            if (hasHeader && noHeader)
            {
                throw new Exception("Cannot specify both --H|header and --x|no-header.");
            }

            /* An explicit header choice switches auto-detection off. */
            if (hasHeader || noHeader) autoDetectHeader = false;

            /* Zero look-ahead has limited utility unless the first line is known to
             * be a header. Good chance the user will get an unintended behavior.
             */
            if (autoDetectHeader && lookahead == 0)
            {
                assert (!noHeader && !hasHeader);
                throw new Exception("Cannot auto-detect header with zero look-ahead. Specify either '--H|header' or '--x|no-header' when using '--l|lookahead 0'.");
            }

            /* A replacement string implies replacement; replacement without a string uses "--". */
            if (emptyReplacement.length == 0)
            {
                if (replaceEmpty) emptyReplacement = "--";
            }
            else
            {
                replaceEmpty = true;
            }

            if (emptyReplacement.length > 0)
            {
                emptyReplacementPrintWidth = monospacePrintWidth(emptyReplacement);
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }

    /* getopt callback for --p|precision. Stores the converted value and also turns
     * on --f|format-floats. The conversion runs first, so a failed conversion
     * leaves formatFloats untouched.
     */
    private void floatPrecisionOptionHandler(string option, string optionVal) @safe pure
    {
        import std.conv : to;
        floatPrecision = to!size_t(optionVal);
        formatFloats = true;
    }
}

/** tsvPretty - Main loop, operating on input files and passing control to a
 * TsvPrettyProcessor instance. This separates physical I/O sources and sinks
 * from the underlying processing algorithm, which operates on generic ranges.
 *
 * Files are read in the order given; with no files, standard input ("-") is read.
 * Within each file, lines numbered up to options.preambleLines go to
 * processPreambleLine, the next line goes to processFileFirstLine, and all later
 * lines go to processLine. finish is called once after the last file.
 *
 * A lockingTextWriter is created for every input line (and once more for finish).
 */
void tsvPretty(in ref TsvPrettyOptions options, string[] files)
{
    auto firstNonPreambleLine = options.preambleLines + 1;
    auto processor = TsvPrettyProcessor(options);
    auto inputNames = (files.length > 0) ? files : ["-"];

    foreach (inputName; inputNames)
    {
        auto inputStream = (inputName == "-") ? stdin : inputName.File();
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            /* One writer per line, whichever handler ends up receiving it. */
            auto lineSink = outputRangeObject!(char, char[])(stdout.lockingTextWriter);

            if (lineNum > firstNonPreambleLine) processor.processLine(lineSink, line);
            else if (lineNum == firstNonPreambleLine) processor.processFileFirstLine(lineSink, line);
            else processor.processPreambleLine(lineSink, line);
        }
    }

    processor.finish(outputRangeObject!(char, char[])(stdout.lockingTextWriter));
}

/** TsvPrettyProcessor - Maintains state of processing and exposes operations for
 * processing individual input lines.
 *
 * TsvPrettyProcessor knows that input is file-based, but doesn't deal with actual
 * files or reading lines from input. That is the job of the caller. Output is
 * written to an output range. The caller is expected to pass each line in the
 * order received; that assumption is built into its processing.
*
 * In addition to the constructor, there are four API methods:
 *   * processPreambleLine - Called to process a preamble line occurring before
 *     the header line or first line of data.
 *   * processFileFirstLine - Called to process the first line of each file. This
 *     enables header processing.
 *   * processLine - Called to process all lines except for the first line of a file.
 *   * finish - Called at the end of all processing. This is needed in case the
 *     look-ahead cache is still being filled when input terminates.
 */

struct TsvPrettyProcessor
{
    import std.array : appender;

private:
    /* State of header auto-detection; 'none' means no decision has been made yet. */
    private enum AutoDetectHeaderResult { none, hasHeader, noHeader }

    private TsvPrettyOptions _options;
    private size_t _fileCount = 0;             // Files whose first line has been seen
    private size_t _dataLineOutputCount = 0;   // Data lines written so far
    private bool _stillCaching = true;         // True while lines go to the look-ahead cache
    private string _candidateHeaderLine;       // First line of the first file, in auto-detect mode
    private auto _lookaheadCache = appender!(string[])();
    private FieldFormat[] _fieldVector;        // One entry per column
    private AutoDetectHeaderResult _autoDetectHeaderResult = AutoDetectHeaderResult.none;

    /* Stores the options. Caching is skipped from the start only when there is no
     * header and look-ahead is zero.
     */
    this(const TsvPrettyOptions options) @safe pure nothrow @nogc
    {
        _options = options;
        if (options.lookahead == 0 && options.noHeader) _stillCaching = false;
    }

    invariant
    {
        /* Exactly how the header is handled must be known. */
        assert(_options.autoDetectHeader || _options.hasHeader || _options.noHeader);

        /* The cache never reaches the look-ahead size; it is flushed when it would. */
        assert(_lookaheadCache.data.length < _options.lookahead ||
               (_lookaheadCache.data.length == 0 && _options.lookahead == 0));
    }

    /* Writes a preamble line unchanged, followed by a newline. Preamble lines are
     * dropped once the first line of any file has been processed.
     */
    void processPreambleLine(OutputRange!char outputStream, const char[] line)
    {
        if (_fileCount != 0) return;

        put(outputStream, line);
        put(outputStream, '\n');
    }

    /* Handles the first non-preamble line of a file, which may be a header.
     *
     * - With no-header, the line is ordinary data.
     * - With has-header, the first file's line becomes the header and the first
     *   line of every later file is discarded.
     * - With auto-detect, the first file's line is held as a candidate header. If a
     *   decision has not been reached when the second file starts, the candidate is
     *   a header exactly when the second file starts with the identical line.
     */
    void processFileFirstLine(OutputRange!char outputStream, const char[] line)
    {
        import std.conv : to;

        _fileCount++;

        if (_options.noHeader)
        {
            processLine(outputStream, line);
            return;
        }

        if (_options.hasHeader)
        {
            if (_fileCount == 1)
            {
                setHeaderLine(line);
                if (_options.lookahead == 0) outputLookaheadCache(outputStream);
            }
            return;
        }

        assert(_options.autoDetectHeader);

        final switch (_autoDetectHeaderResult)
        {
            case AutoDetectHeaderResult.hasHeader:
                /* Already decided: this is a repeated header, so drop it. */
                assert(_fileCount > 1);
                break;

            case AutoDetectHeaderResult.noHeader:
                /* Already decided: first lines are data. */
                assert(_fileCount > 1);
                processLine(outputStream, line);
                break;

            case AutoDetectHeaderResult.none:
                if (_fileCount == 1)
                {
                    assert(_candidateHeaderLine.length == 0);
                    _candidateHeaderLine = to!string(line);
                }
                else if (_fileCount == 2)
                {
                    if (_candidateHeaderLine != line)
                    {
                        _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader;
                        updateFieldFormatsForLine(_candidateHeaderLine);
                        processLine(outputStream, line);
                    }
                    else
                    {
                        _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader;
                        setHeaderLine(_candidateHeaderLine);

                        /* Edge case: First file has only a header line and look-ahead set to zero. */
                        if (_stillCaching && _options.lookahead == 0) outputLookaheadCache(outputStream);
                    }
                }
                break;
        }
    }

    /* Handles a data line: cached while look-ahead is in progress, written otherwise. */
    void processLine(OutputRange!char outputStream, const char[] line)
    {
        if (!_stillCaching) outputDataLine(outputStream, line);
        else cacheDataLine(outputStream, line);
    }

    /* Flushes the look-ahead cache if input ended while it was still being filled. */
    void finish(OutputRange!char outputStream)
    {
        if (_stillCaching) outputLookaheadCache(outputStream);
    }

private:
    /* outputLookaheadCache ends the caching phase. It settles a pending auto-detect
     * header decision, finalizes the format of every field, writes the header (or the
     * candidate header line as data when it was judged not to be a header), writes
     * all cached lines, and then empties the cache.
     */
    void outputLookaheadCache(OutputRange!char outputStream)
    {
        assert(_stillCaching);

        immutable bool haveCandidate = _candidateHeaderLine.length != 0;

        if (haveCandidate && _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.none)
        {
            if (candidateHeaderLooksLikeHeader())
            {
                _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader;
                setHeaderLine(_candidateHeaderLine);
            }
            else
            {
                _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader;
            }
        }

        /* Evaluated after the decision above, which may have just been made. */
        immutable bool detectedHeader = _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader;
        immutable bool detectedNoHeader = _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.noHeader;

        if (_options.hasHeader || detectedHeader)
        {
            finalizeFieldFormatting();
            outputHeader(outputStream);
        }
        else if (detectedNoHeader && haveCandidate)
        {
            /* The candidate is data after all: count it, then write it first. */
            updateFieldFormatsForLine(_candidateHeaderLine);
            finalizeFieldFormatting();
            outputDataLine(outputStream, _candidateHeaderLine);
        }
        else
        {
            finalizeFieldFormatting();
        }

        foreach (cachedLine; _lookaheadCache.data) outputDataLine(outputStream, cachedLine);
        _lookaheadCache.clear;
        _stillCaching = false;
    }

    /* Returns true when some field of the candidate header is text while the same
     * column, as typed from the lines seen so far, is integer, floating point or
     * exponent.
     */
    bool candidateHeaderLooksLikeHeader() @safe
    {
        import std.algorithm : splitter;

        foreach (columnIndex, headerValue; _candidateHeaderLine.splitter(_options.delim).enumerate)
        {
            /* Type the header field on its own, using a scratch FieldFormat. */
            auto scratchFormat = FieldFormat(columnIndex);
            scratchFormat.updateForFieldValue(headerValue, _options);

            if (columnIndex >= _fieldVector.length) continue;
            if (scratchFormat.fieldType != FieldType.text) continue;

            immutable FieldType columnType = _fieldVector[columnIndex].fieldType;
            if (columnType == FieldType.integer ||
                columnType == FieldType.floatingPoint ||
                columnType == FieldType.exponent)
            {
                return true;
            }
        }

        return false;
    }

    /* Splits the line on the delimiter and records each piece as the header of the
     * matching column, adding columns as needed.
     */
    void setHeaderLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (columnIndex, headerText; line.splitter(_options.delim).enumerate)
        {
            if (columnIndex == _fieldVector.length) _fieldVector ~= FieldFormat(columnIndex);
            assert(columnIndex < _fieldVector.length);
            _fieldVector[columnIndex].setHeader(headerText);
        }
    }

    /* Adds a line to the look-ahead cache and folds its values into the field
     * formats. The cache is flushed when it reaches the look-ahead size.
     */
    void cacheDataLine(OutputRange!char outputStream, const char[] line)
    {
        import std.conv : to;

        assert(_lookaheadCache.data.length < _options.lookahead);

        _lookaheadCache.put(to!string(line));
        updateFieldFormatsForLine(line);

        immutable bool cacheIsFull = _lookaheadCache.data.length == _options.lookahead;
        if (cacheIsFull) outputLookaheadCache(outputStream);
    }

    /* Splits the line on the delimiter and registers each value as a sample for the
     * matching column, adding columns as needed.
     */
    void updateFieldFormatsForLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (columnIndex, value; line.splitter(_options.delim).enumerate)
        {
            if (columnIndex == _fieldVector.length) _fieldVector ~= FieldFormat(columnIndex);
            assert(columnIndex < _fieldVector.length);
            _fieldVector[columnIndex].updateForFieldValue(value, _options);
        }
    }

    /* Finalizes every column left to right. Each column starts where the previous
     * one ended plus the configured space between fields.
     */
    void finalizeFieldFormatting() @safe pure @nogc nothrow
    {
        size_t startOfNext = 0;
        foreach (ref field; _fieldVector)
        {
            immutable size_t fieldEnd = field.finalizeFormatting(startOfNext, _options);
            startOfNext = fieldEnd + _options.spaceBetweenFields;
        }
    }

    /* Writes the header line, followed by an underline row when requested. */
    void outputHeader(OutputRange!char outputStream)
    {
        outputHeaderRow!(No.writeUnderline)(outputStream);
        if (_options.underlineHeader) outputHeaderRow!(Yes.writeUnderline)(outputStream);
    }

    /* Writes one header row (header text or underline characters): each field is
     * preceded by the spaces needed to reach its start position, and the row ends
     * with a newline.
     */
    void outputHeaderRow(Flag!"writeUnderline" underline)(OutputRange!char outputStream)
    {
        size_t outputPosition = 0;
        foreach (ref field; _fieldVector)
        {
            size_t padding = field.startPosition - outputPosition;
            put(outputStream, repeat(" ", padding));
            outputPosition += padding;
            outputPosition += field.writeHeader!(underline)(outputStream, _options);
        }
        put(outputStream, '\n');
    }

    /* Writes one data line. When the repeat-header option is active and a header is
     * in use, a blank line and the header are written first every repeatHeader data
     * lines. Columns not seen during caching are added and formatted from this value.
     */
    void outputDataLine(OutputRange!char outputStream, const char[] line)
    {
        import std.algorithm : splitter;

        /* Repeating header option. */
        if (_options.repeatHeader != 0 && _dataLineOutputCount != 0)
        {
            immutable bool headerInUse = _options.hasHeader ||
                (_options.autoDetectHeader &&
                 _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader);

            if (headerInUse && _dataLineOutputCount % _options.repeatHeader == 0)
            {
                put(outputStream, '\n');
                outputHeader(outputStream);
            }
        }

        _dataLineOutputCount++;

        size_t outputPosition = 0;
        foreach (columnIndex, value; line.splitter(_options.delim).enumerate)
        {
            if (columnIndex == _fieldVector.length)
            {
                /* Line is longer than any seen while caching. Add a new FieldFormat entry
                 * and set the line formatting based on this field value.
                 */
                _fieldVector ~= FieldFormat(columnIndex);

                size_t newFieldStart = 0;
                if (columnIndex != 0)
                {
                    newFieldStart = _fieldVector[columnIndex - 1].endPosition + _options.spaceBetweenFields;
                }

                _fieldVector[columnIndex].updateForFieldValue(value, _options);
                _fieldVector[columnIndex].finalizeFormatting(newFieldStart, _options);
            }

            assert(columnIndex < _fieldVector.length);

            FieldFormat columnFormat = _fieldVector[columnIndex];
            size_t columnStart = columnFormat.startPosition;

            size_t padding;
            if (outputPosition < columnStart) padding = columnStart - outputPosition;
            else if (columnIndex == 0) padding = 0;
            else padding = 1;    // Previous field went long. One space between fields

            put(outputStream, repeat(" ", padding));
            outputPosition += padding;
            outputPosition += columnFormat.writeFieldValue(outputStream, outputPosition, value, _options);
        }
        put(outputStream, '\n');
    }
}

/** FieldFormat holds all the formatting info needed to format data values in a specific
 * column. e.g. Field 1 may be text, field 2 may be a float, etc. This is calculated
 * during the caching phase. Each FieldFormat instance is part of a vector representing
 * the full row, so each includes the start position on the line and similar data.
 *
 * APIs used during the caching phase to gather field value samples
 * - this - Initial construction. Takes the field index.
 * - setHeader - Used to set the header text.
 * - updateForFieldValue - Used to add the next field value sample.
 * - finalizeFormatting - Used at the end of caching to finalize the format choices.
 *
 * APIs used after caching is finished (after finalizeFormatting):
 * - startPosition - Returns the expected start position for the field.
 * - endPosition - Returns the expected end position for the field.
 * - writeHeader - Outputs the header, properly aligned.
 * - writeFieldValue - Outputs the current field value, properly aligned.
592 */ 593 594 enum FieldType { unknown, text, integer, floatingPoint, exponent }; 595 enum FieldAlignment { left, right }; 596 597 struct FieldFormat 598 { 599 private: 600 size_t _fieldIndex; // Zero-based index in the line 601 string _header = ""; // Original field header 602 size_t _headerPrintWidth = 0; 603 FieldType _type = FieldType.unknown; 604 FieldAlignment _alignment = FieldAlignment.left; 605 size_t _startPosition = 0; 606 size_t _printWidth = 0; 607 size_t _precision = 0; // Number of digits after the decimal point 608 609 /* These are used while doing initial type and print format detection. */ 610 size_t _minRawPrintWidth = 0; 611 size_t _maxRawPrintWidth = 0; 612 size_t _maxDigitsBeforeDecimal = 0; 613 size_t _maxDigitsAfterDecimal = 0; 614 size_t _maxSignificantDigits = 0; // Digits to include in exponential notation 615 616 public: 617 this(size_t fieldIndex) @safe pure nothrow @nogc 618 { 619 _fieldIndex = fieldIndex; 620 } 621 622 /* setHeader is called to set the header text. */ 623 void setHeader(const char[] header) @safe 624 { 625 import std.conv : to; 626 627 _header = header.to!string; 628 _headerPrintWidth = _header.monospacePrintWidth; 629 } 630 631 size_t startPosition() nothrow pure @safe @property 632 { 633 return _startPosition; 634 } 635 636 size_t endPosition() nothrow pure @safe @property 637 { 638 return _startPosition + _printWidth; 639 } 640 641 FieldType fieldType() nothrow pure @safe @property 642 { 643 return _type; 644 } 645 646 /** writeHeader - Writes the field header or underline characters to the output stream. 647 * 648 * The current output position should have been written up to the field's start position, 649 * including any spaces between fields. Unlike data fields, there is no need to correct 650 * for previous fields that have run long. This routine does not output trailing spaces. 651 * This makes it simpler for lines to avoid unnecessary trailing spaces. 
652 * 653 * Underlines can either be written the full width of the field or the just under the 654 * text of the header. At present this is a template parameter (compile-time). 655 * 656 * The print width of the output is returned. 657 */ 658 size_t writeHeader (Flag!"writeUnderline" writeUnderline = No.writeUnderline, 659 Flag!"fullWidthUnderline" fullWidthUnderline = No.fullWidthUnderline) 660 (OutputRange!char outputStream, in ref TsvPrettyOptions options) 661 { 662 import std.range : repeat; 663 664 size_t positionsWritten = 0; 665 if (_headerPrintWidth > 0) 666 { 667 static if (writeUnderline) 668 { 669 static if (fullWidthUnderline) 670 { 671 put(outputStream, repeat("-", _printWidth)); 672 positionsWritten += _printWidth; 673 } 674 else // Underline beneath the header text only 675 { 676 if (_alignment == FieldAlignment.right) 677 { 678 put(outputStream, repeat(" ", _printWidth - _headerPrintWidth)); 679 positionsWritten += _printWidth - _headerPrintWidth; 680 } 681 put(outputStream, repeat("-", _headerPrintWidth)); 682 positionsWritten += _headerPrintWidth; 683 } 684 } 685 else 686 { 687 if (_alignment == FieldAlignment.right) 688 { 689 put(outputStream, repeat(" ", _printWidth - _headerPrintWidth)); 690 positionsWritten += _printWidth - _headerPrintWidth; 691 } 692 put(outputStream, _header); 693 positionsWritten += _headerPrintWidth; 694 } 695 } 696 return positionsWritten; 697 } 698 699 /* writeFieldValue writes the field value for the current column The caller needs 700 * to generate output at least to the column's start position, but can go beyond 701 * if previous fields have run long. 702 * 703 * The field value is aligned properly in the field. Either left aligned (text) or 704 * right aligned (numeric). Floating point fields are both right aligned and 705 * decimal point aligned. The number of bytes written is returned. Trailing spaces 706 * are not added, the caller must add any necessary trailing spaces prior to 707 * printing the next field. 
708 */ 709 size_t writeFieldValue(OutputRange!char outputStream, size_t currPosition, 710 const char[] fieldValue, in ref TsvPrettyOptions options) 711 in 712 { 713 assert(currPosition >= _startPosition); // Caller resposible for advancing to field start position. 714 assert(_type == FieldType.text || _type == FieldType.integer || 715 _type == FieldType.floatingPoint || _type == FieldType.exponent); 716 } 717 body 718 { 719 import std.algorithm : find, max, min; 720 import std.conv : to, ConvException; 721 import std.format : format; 722 723 /* Create the print version of the string. Either the raw value or a formatted 724 * version of a float. 725 */ 726 string printValue; 727 if (!options.formatFloats || _type == FieldType.text || _type == FieldType.integer) 728 { 729 printValue = fieldValue.to!string; 730 } 731 else 732 { 733 assert(options.formatFloats); 734 assert(_type == FieldType.exponent || _type == FieldType.floatingPoint); 735 736 if (_type == FieldType.exponent) 737 { 738 printValue = fieldValue.formatExponentValue(_precision); 739 } 740 else 741 { 742 printValue = fieldValue.formatFloatingPointValue(_precision); 743 } 744 } 745 746 if (printValue.length == 0 && options.replaceEmpty) printValue = options.emptyReplacement; 747 size_t printValuePrintWidth = printValue.monospacePrintWidth; 748 749 /* Calculate leading spaces needed for right alignment. */ 750 size_t leadingSpaces = 0; 751 if (_alignment == FieldAlignment.right) 752 { 753 /* Target width adjusts the column width to account for overrun by the previous field. */ 754 size_t targetWidth; 755 if (currPosition == _startPosition) 756 { 757 targetWidth = _printWidth; 758 } 759 else 760 { 761 size_t startGap = currPosition - _startPosition; 762 targetWidth = max(printValuePrintWidth, 763 startGap < _printWidth ? _printWidth - startGap : 0); 764 } 765 766 leadingSpaces = (printValuePrintWidth < targetWidth) ? 
767 targetWidth - printValuePrintWidth : 0; 768 769 /* The above calculation assumes the print value is fully right aligned. 770 * This is not correct when raw value floats are being used rather than 771 * formatted floats, as different values will have different precision. 772 * The next adjustment accounts for this, dropping leading spaces as 773 * needed to align the decimal point. Note that text and exponential 774 * values get aligned strictly against right boundaries. 775 */ 776 if (leadingSpaces > 0 && _precision > 0 && 777 _type == FieldType.floatingPoint && !options.formatFloats) 778 { 779 import std.algorithm : canFind, findSplit; 780 import std.string : isNumeric; 781 782 if (printValue.isNumeric && !printValue.canFind!(x => x == 'e' || x == 'E')) 783 { 784 size_t decimalAndDigitsLength = printValue.find(".").length; 785 size_t trailingSpaces = 786 (decimalAndDigitsLength == 0) ? _precision + 1 : 787 (decimalAndDigitsLength > _precision) ? 0 : 788 _precision + 1 - decimalAndDigitsLength; 789 790 leadingSpaces = (leadingSpaces > trailingSpaces) ? 791 leadingSpaces - trailingSpaces : 0; 792 } 793 } 794 } 795 put(outputStream, repeat(' ', leadingSpaces)); 796 put(outputStream, printValue); 797 return printValuePrintWidth + leadingSpaces; 798 } 799 800 /** updateForFieldValue updates type and format given a new field value. 801 * 802 * This is called during look-ahead caching to register a new sample value for the 803 * column. The key components updates are field type and print width. 804 */ 805 void updateForFieldValue(const char[] fieldValue, in ref TsvPrettyOptions options) @safe 806 { 807 import std.algorithm : findAmong, findSplit, max, min; 808 import std.conv : to, ConvException; 809 import std.string : isNumeric; 810 811 size_t fieldValuePrintWidth = fieldValue.monospacePrintWidth; 812 size_t fieldValuePrintWidthWithEmpty = 813 (fieldValuePrintWidth == 0 && options.replaceEmpty) ? 
814 options.emptyReplacementPrintWidth : 815 fieldValuePrintWidth; 816 817 _maxRawPrintWidth = max(_maxRawPrintWidth, fieldValuePrintWidthWithEmpty); 818 _minRawPrintWidth = (_minRawPrintWidth == 0) ? 819 fieldValuePrintWidthWithEmpty : 820 min(_minRawPrintWidth, fieldValuePrintWidthWithEmpty); 821 822 if (_type == FieldType.text) 823 { 824 /* Already text, can't become anything else. */ 825 } 826 else if (fieldValuePrintWidth == 0) 827 { 828 /* Don't let an empty field override a numeric field type. */ 829 } 830 else if (!fieldValue.isNumeric) 831 { 832 /* Not parsable as a number. Switch from unknown or numeric type to text. */ 833 _type = FieldType.text; 834 } 835 else 836 { 837 /* Field type is currently unknown or numeric, and current field parses as numeric. 838 * See if it parses as integer or float. Integers will parse as floats, so try 839 * integer types first. 840 */ 841 FieldType parsesAs = FieldType.unknown; 842 long longValue; 843 ulong ulongValue; 844 double doubleValue; 845 try 846 { 847 longValue = fieldValue.to!long; 848 parsesAs = FieldType.integer; 849 } 850 catch (ConvException) 851 { 852 try 853 { 854 ulongValue = fieldValue.to!ulong; 855 parsesAs = FieldType.integer; 856 } 857 catch (ConvException) 858 { 859 try 860 { 861 doubleValue = fieldValue.to!double; 862 import std.algorithm : findAmong; 863 parsesAs = (fieldValue.findAmong("eE").length == 0) ? 864 FieldType.floatingPoint : FieldType.exponent; 865 } 866 catch (ConvException) 867 { 868 /* Note: This means isNumeric thinks it's a number, but conversions all failed. */ 869 parsesAs = FieldType.text; 870 } 871 } 872 } 873 874 if (parsesAs == FieldType.text) 875 { 876 /* Not parsable as a number (despite isNumeric result). Switch to text type. */ 877 _type = FieldType.text; 878 } 879 else if (parsesAs == FieldType.exponent) 880 { 881 /* Exponential notion supersedes both vanilla floats and integers. 
*/ 882 _type = FieldType.exponent; 883 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); 884 885 if (auto decimalSplit = fieldValue.findSplit(".")) 886 { 887 auto fromExponent = decimalSplit[2].findAmong("eE"); 888 size_t numDigitsAfterDecimal = decimalSplit[2].length - fromExponent.length; 889 _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length); 890 _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, numDigitsAfterDecimal); 891 } 892 else 893 { 894 /* Exponent without a decimal point. */ 895 auto fromExponent = fieldValue.findAmong("eE"); 896 assert(fromExponent.length > 0); 897 size_t numDigits = fieldValue.length - fromExponent.length; 898 _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, numDigits); 899 } 900 } 901 else if (parsesAs == FieldType.floatingPoint) 902 { 903 /* Floating point supercedes integer but not exponential. */ 904 if (_type != FieldType.exponent) _type = FieldType.floatingPoint; 905 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); 906 907 if (auto decimalSplit = fieldValue.findSplit(".")) 908 { 909 _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length); 910 _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, decimalSplit[2].length); 911 } 912 } 913 else 914 { 915 assert(parsesAs == FieldType.integer); 916 if (_type != FieldType.floatingPoint) _type = FieldType.integer; 917 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits); 918 _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, fieldValue.length); 919 } 920 } 921 } 922 923 /* finalizeFormatting updates field formatting info based on the current state. It is 924 * expected to be called after adding field entries via updateForFieldValue(). It 925 * returns its new end position. 
*/
    size_t finalizeFormatting (size_t startPosition, in ref TsvPrettyOptions options) @safe pure @nogc nothrow
    {
        import std.algorithm : max, min;

        _startPosition = startPosition;

        /* A field whose type was never resolved is formatted as text. */
        if (_type == FieldType.unknown) _type = FieldType.text;

        /* Numeric field types are right aligned, everything else is left aligned. */
        const bool isNumericType =
            _type == FieldType.integer ||
            _type == FieldType.floatingPoint ||
            _type == FieldType.exponent;
        _alignment = isNumericType ? FieldAlignment.right : FieldAlignment.left;

        if (_type == FieldType.floatingPoint)
        {
            /* Widest value: digits before the decimal point, the fraction digits that
             * will be kept, and one position for the decimal point when any fraction
             * digits are kept.
             */
            size_t fractionDigits = min(options.floatPrecision, _maxDigitsAfterDecimal);
            size_t widestValue = _maxDigitsBeforeDecimal + fractionDigits;
            if (fractionDigits > 0) widestValue++;

            _precision = fractionDigits;
            _printWidth = max(1, _headerPrintWidth, widestValue);
        }
        else if (_type == FieldType.exponent)
        {
            /* Precision is one less than the number of significant digits seen,
             * capped by the configured float precision.
             */
            size_t precisionLimit = (_maxSignificantDigits > 0) ? _maxSignificantDigits - 1 : 0;
            _precision = min(options.floatPrecision, precisionLimit);

            /* Raw values are printed as-is unless float formatting is on. The constant 7
             * presumably covers sign, leading digit, decimal point and a four character
             * exponent such as "e+03" - NOTE(review): confirm against the writer code.
             */
            size_t widestValue = options.formatFloats ? _precision + 7 : _maxRawPrintWidth;
            _printWidth = max(1, _headerPrintWidth, widestValue);
        }
        else if (_type == FieldType.integer)
        {
            _precision = 0;
            _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth, _maxRawPrintWidth);
        }
        else
        {
            /* Text: the widest raw value is capped by the max field width option, but the
             * field is never made narrower than the header or the narrowest raw value.
             */
            _precision = 0;
            _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth,
                              min(options.maxFieldPrintWidth, _maxRawPrintWidth));
        }

        /* The end position of this field. */
        return _startPosition + _printWidth;
    }
}

/** formatFloatingPointValue - Returns the printed representation of a raw value
 * formatted as a fixed precision floating number. This includes zero padding or
 * truncation of trailing digits as necessary to meet the desired precision.
 *
 * If the value cannot be interpreted as a double then the raw value is returned.
 * Similarly, values in exponential notation are returned without reformatting.
*
* This routine is used to format values in columns identified as floating point.
*/
string formatFloatingPointValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    string printValue;

    if (value.canFind!(x => x == 'e' || x == 'E'))
    {
        /* Exponential notation. Use the raw value. */
        printValue = value.to!string;
    }
    else
    {
        try
        {
            double doubleValue = value.to!double;
            if (doubleValue.isFinite)
            {
                size_t numPrecisionDigits = value.precisionDigits;
                if (numPrecisionDigits >= precision)
                {
                    /* At least as many fraction digits as requested: format rounds. */
                    printValue = format("%.*f", precision, doubleValue);
                }
                else if (numPrecisionDigits == 0)
                {
                    /* No fraction digits in the input: append the decimal point and
                     * literal zeros rather than asking format for more digits.
                     */
                    printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ "." ~ repeat("0", precision).join;
                }
                else
                {
                    /* Fewer fraction digits than requested: format at the input's own
                     * precision, then pad with literal zeros.
                     */
                    printValue = format("%.*f", numPrecisionDigits, doubleValue) ~ repeat("0", precision - numPrecisionDigits).join;
                }
            }
            else printValue = value.to!string;  // NaN or Infinity
        }
        catch (ConvException) printValue = value.to!string;  // Not a number. Use the raw value.
    }
    return printValue;
}

@safe unittest
{
    assert("".formatFloatingPointValue(3) == "");
    assert(" ".formatFloatingPointValue(3) == " ");
    assert("abc".formatFloatingPointValue(3) == "abc");
    assert("nan".formatFloatingPointValue(3) == "nan");
    assert("0".formatFloatingPointValue(0) == "0");
    assert("1".formatFloatingPointValue(0) == "1");
    assert("1.".formatFloatingPointValue(0) == "1");
    assert("1".formatFloatingPointValue(3) == "1.000");
    assert("1000".formatFloatingPointValue(3) == "1000.000");
    assert("1000.001".formatFloatingPointValue(5) == "1000.00100");
    assert("1000.001".formatFloatingPointValue(3) == "1000.001");
    assert("1000.001".formatFloatingPointValue(2) == "1000.00");
    assert("1000.006".formatFloatingPointValue(2) == "1000.01");
    assert("-0.1".formatFloatingPointValue(1) == "-0.1");
    assert("-0.1".formatFloatingPointValue(3) == "-0.100");
    assert("-0.001".formatFloatingPointValue(3) == "-0.001");
    assert("-0.006".formatFloatingPointValue(2) == "-0.01");
    assert("-0.001".formatFloatingPointValue(1) == "-0.0");
    assert("-0.001".formatFloatingPointValue(0) == "-0");
    assert("0e+00".formatFloatingPointValue(0) == "0e+00");
    assert("0.00e+00".formatFloatingPointValue(0) == "0.00e+00");
    assert("1e+06".formatFloatingPointValue(1) == "1e+06");
    assert("1e+06".formatFloatingPointValue(2) == "1e+06");
    assert("1E-06".formatFloatingPointValue(1) == "1E-06");
    assert("1.1E+6".formatFloatingPointValue(2) == "1.1E+6");
    assert("1.1E+100".formatFloatingPointValue(2) == "1.1E+100");
}

/** formatExponentValue - Returns the printed representation of a raw value formatted
 * using exponential notation and a specific precision. If the value cannot be interpreted
 * as a double then a copy of the original value is returned. The same is done for values
 * that are not finite (NaN, infinity).
 *
 * This routine is used to format values in columns identified as having exponent format.
 */
string formatExponentValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind, findSplit;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    string printValue;
    try
    {
        double doubleValue = value.to!double;
        if (doubleValue.isFinite)
        {
            /* Precision in exponential notation is the number of digits after the
             * decimal point, one less than the number of significant digits.
             */
            size_t numSignificantDigits = value.significantDigits;
            size_t numPrecisionDigits = (numSignificantDigits == 0) ? 0 : numSignificantDigits - 1;
            if (numPrecisionDigits >= precision)
            {
                /* At least as many digits as requested: format rounds. */
                printValue = format("%.*e", precision, doubleValue);
            }
            else
            {
                /* Fewer digits than requested: format at the input's own precision, then
                 * insert literal zeros between the mantissa and the exponent.
                 */
                string unpaddedPrintValue = format("%.*e", numPrecisionDigits, doubleValue);
                auto exponentSplit = unpaddedPrintValue.findSplit("e");   // Uses the same exponent case as format call.
                if (numPrecisionDigits == 0)
                {
                    /* The mantissa has no decimal point yet, add one before padding. */
                    assert(precision != 0);
                    assert(!exponentSplit[0].canFind("."));
                    printValue = exponentSplit[0] ~ "." ~ repeat("0", precision).join ~ exponentSplit[1] ~ exponentSplit[2];
                }
                else
                {
                    printValue = exponentSplit[0] ~ repeat("0", precision - numPrecisionDigits).join ~ exponentSplit[1] ~ exponentSplit[2];
                }
            }
        }
        else printValue = value.to!string;  // NaN or Infinity
    }
    catch (ConvException) printValue = value.to!string;  // Not a number. Use the raw value.

    return printValue;
}

@safe unittest
{
    assert("".formatExponentValue(3) == "");
    assert(" ".formatExponentValue(3) == " ");
    assert("abc".formatExponentValue(3) == "abc");
    assert("nan".formatExponentValue(3) == "nan");
    assert("0".formatExponentValue(0) == "0e+00");
    assert("1".formatExponentValue(0) == "1e+00");
    assert("1.".formatExponentValue(0) == "1e+00");
    assert("1".formatExponentValue(3) == "1.000e+00");
    assert("1000".formatExponentValue(3) == "1.000e+03");
    assert("1000.001".formatExponentValue(5) == "1.00000e+03");
    assert("1000.001".formatExponentValue(3) == "1.000e+03");
    assert("1000.001".formatExponentValue(6) == "1.000001e+03");
    assert("1000.006".formatExponentValue(5) == "1.00001e+03");
    assert("-0.1".formatExponentValue(1) == "-1.0e-01");
    assert("-0.1".formatExponentValue(3) == "-1.000e-01");
    assert("-0.001".formatExponentValue(3) == "-1.000e-03");
    assert("-0.001".formatExponentValue(1) == "-1.0e-03");
    assert("-0.001".formatExponentValue(0) == "-1e-03");
    assert("0e+00".formatExponentValue(0) == "0e+00");
    assert("0.00e+00".formatExponentValue(0) == "0e+00");
    assert("1e+06".formatExponentValue(1) == "1.0e+06");
    assert("1e+06".formatExponentValue(2) == "1.00e+06");
    assert("1.0001e+06".formatExponentValue(1) == "1.0e+06");
    assert("1.0001e+06".formatExponentValue(5) == "1.00010e+06");
}

/** significantDigits - Returns the number of significant digits in a numeric string.
 *
 * Significant digits are those needed to represent a number in exponential notation.
 * Only the mantissa is examined, digits in an exponent are never counted. A zero
 * value has one significant digit. Zero is returned for values that are not finite
 * (NaN, infinity).
 *
 * Examples:
 *   22.345 - 5 digits
 *   10.010 - 4 digits
 *   0.0032 - 2 digits
 *   0e+10  - 1 digit
 */
size_t significantDigits(const char[] numericString) @safe pure
{
    import std.algorithm : canFind, find, findAmong, stripRight;
    import std.ascii : isDigit;
    import std.math : isFinite;
    import std.string : isNumeric;
    import std.conv : to;
    assert (numericString.isNumeric);

    size_t significantDigits = 0;
    if (numericString.to!double.isFinite)
    {
        /* Remove the exponent before looking for digits. Searching the full string for
         * the first non-zero digit would land inside the exponent when the mantissa is
         * all zeros (e.g. "0e+10"), counting exponent digits as significant digits.
         */
        auto exponentPart = numericString.findAmong("eE");
        auto mantissa = numericString[0 .. $ - exponentPart.length];

        /* Skip the sign, leading zeros, and a leading decimal point. */
        auto digitsPart = mantissa.find!(x => x.isDigit && x != '0');

        if (digitsPart.canFind('.'))
        {
            /* Trailing zeros after the decimal point are not significant. The
             * decimal point itself is excluded from the count.
             */
            digitsPart = digitsPart.stripRight('0');
            significantDigits = digitsPart.length - 1;
        }
        else
        {
            significantDigits = digitsPart.length;
        }

        /* Zero values have no non-zero digits, but still need one digit to print. */
        if (significantDigits == 0) significantDigits = 1;
    }

    return significantDigits;
}

@safe pure unittest
{
    assert("0".significantDigits == 1);
    assert("10".significantDigits == 2);
    assert("0.0".significantDigits == 1);
    assert("-10.0".significantDigits == 2);
    assert("-.01".significantDigits == 1);
    assert("-.5401".significantDigits == 4);
    assert("1010.010".significantDigits == 6);
    assert("0.0003003".significantDigits == 4);
    assert("6e+06".significantDigits == 1);
    assert("6.0e+06".significantDigits == 1);
    assert("6.5e+06".significantDigits == 2);
    assert("6.005e+06".significantDigits == 4);

    /* Zero mantissa: exponent digits must not be counted. */
    assert("0e+00".significantDigits == 1);
    assert("0e+10".significantDigits == 1);
    assert("0.00e+123".significantDigits == 1);
    assert("-0.0E-25".significantDigits == 1);
}

/* precisionDigits - Returns the number of digits to the right of the decimal point in
 * a numeric string. This routine includes trailing zeros in the count.
*/
size_t precisionDigits(const char[] numericString) @safe pure
{
    import std.algorithm : findAmong, findSplit;
    import std.conv : to;
    import std.math : isFinite;
    import std.string : isNumeric;
    assert (numericString.isNumeric);

    /* Zero is returned for values that are not finite or have no decimal point. */
    size_t precisionDigits = 0;
    if (numericString.to!double.isFinite)
    {
        if (auto decimalSplit = numericString.findSplit("."))
        {
            /* Everything after the decimal point, excluding an exponent if present. */
            auto exponentPart = decimalSplit[2].findAmong("eE");
            precisionDigits = decimalSplit[2].length - exponentPart.length;
        }
    }

    return precisionDigits;
}

@safe pure unittest
{
    assert("0".precisionDigits == 0);
    assert("10".precisionDigits == 0);
    assert("0.0".precisionDigits == 1);
    assert("-10.0".precisionDigits == 1);
    assert("-.01".precisionDigits == 2);
    assert("-.5401".precisionDigits == 4);
    assert("1e+06".precisionDigits == 0);
    assert("1.50e+06".precisionDigits == 2);
}

/** monospacePrintWidth - Calculates the expected print width of a string in monospace
 * (fixed-width) fonts.
 *
 * Each grapheme counts as one column, except graphemes starting with a character that
 * is normally rendered double width, which count as two. Double width characters are
 * identified by code point range: the CJK range U+3000-U+9FFF, Hangul, CJK compatibility
 * ideographs, fullwidth forms, and the supplementary ideographic planes. This is an
 * approximation of the Unicode East Asian Width property, not a full implementation.
 *
 * If the string is not valid UTF-8 the length in bytes is returned.
 */
size_t monospacePrintWidth(const char[] str) @safe nothrow
{
    static bool isDoubleWidth(dchar c) @safe pure nothrow @nogc
    {
        return (c >= 0x3000 && c <= 0x9FFF)       // CJK symbols, kana, CJK unified ideographs
            || (c >= 0x1100 && c <= 0x115F)       // Hangul Jamo leading consonants
            || (c >= 0xAC00 && c <= 0xD7A3)       // Hangul syllables
            || (c >= 0xF900 && c <= 0xFAFF)       // CJK compatibility ideographs
            || (c >= 0xFF01 && c <= 0xFF60)       // Fullwidth ASCII variants
            || (c >= 0xFFE0 && c <= 0xFFE6)       // Fullwidth symbol variants
            || (c >= 0x20000 && c <= 0x3FFFD);    // Supplementary ideographic planes
    }

    import std.uni : byGrapheme;

    size_t width = 0;
    try foreach (g; str.byGrapheme) width += isDoubleWidth(g[0]) ? 2 : 1;
    catch (Exception) width = str.length;  // Invalid utf-8 sequence. Catch avoids program failure.

    return width;
}

unittest
{
    assert("".monospacePrintWidth == 0);
    assert(" ".monospacePrintWidth == 1);
    assert("abc".monospacePrintWidth == 3);
    assert("林檎".monospacePrintWidth == 4);
    assert("æble".monospacePrintWidth == 4);
    assert("ვაშლი".monospacePrintWidth == 5);
    assert("größten".monospacePrintWidth == 7);
    assert("한글".monospacePrintWidth == 4);
    assert("ＡＢＣ".monospacePrintWidth == 6);
}