tsv_pretty source code

1 /**
2 Command line tool that prints TSV data aligned for easier reading on consoles
3 and traditional command-line environments.
4 
5 Copyright (c) 2017-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_pretty;
11 
12 import std.range;
13 import std.stdio;
14 import std.typecons : Flag, Yes, No, tuple;
15 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvPrettyOptions, then hands the remaining
     * arguments (the input file names) to tsvPretty.
     *
     * Returns: The exit code supplied by processArgs when it asks to stop (help,
     * version, or an argument error), 1 if tsvPretty throws an Exception, and 0
     * otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports across runs when built by DMD in coverage mode. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvPrettyOptions options;
        auto parsed = options.processArgs(cmdArgs);
        immutable bool shouldContinue = parsed[0];
        immutable int earlyExitCode = parsed[1];
        if (!shouldContinue) return earlyExitCode;

        try
        {
            /* cmdArgs[0] is the program name; everything after it is an input file. */
            tsvPretty(options, cmdArgs[1 .. $]);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
43 
/* Full help text, printed ahead of the option list when --help-verbose is given. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a format intended to be more human readable when
working on the command line. This is done primarily by lining up data into
fixed-width columns. Text is left aligned, numbers are right aligned. Floating
point numbers are aligned on the decimal point when feasible.

Processing begins by reading the initial set of lines into memory to determine
the field widths and data types of each column. This look-ahead buffer is used
for header detection as well. Output begins after this processing is complete.

By default, only the alignment is changed, the actual values are not modified.
Several of the formatting options do modify the values.

Features:

* Floating point numbers: Floats can be printed in fixed-width precision, using
  the same precision for all floats in a column. This makes them line up nicely.
  Precision is determined by values seen during look-ahead processing. The max
  precision defaults to 9, this can be changed when smaller or larger values are
  desired. See the '--f|format-floats' and '--p|precision' options.

* Header lines: Headers are detected automatically when possible. This can be
  overridden when automatic detection doesn't work as desired. Headers can be
  underlined and repeated at regular intervals.

* Missing values: A substitute value can be used for empty fields. This is often
  less confusing than spaces. See '--e|replace-empty' and '--E|empty-replacement'.

* Exponential notation: As part of float formatting, '--f|format-floats'
  re-formats columns where exponential notation is found so all the values in
  the column are displayed using exponential notation with the same precision.

* Preamble: A number of initial lines can be designated as a preamble and output
  unchanged. The preamble is before the header, if a header is present.

* Fonts: Fixed-width fonts are assumed. CJK characters are assumed to be double
  width. This is not always correct, but works well in most cases.

Options:
EOS";
86 
/* Short help text, printed ahead of the option list when --help is given. */
auto helpText = q"EOS
Synopsis: tsv-pretty [options] [file...]

tsv-pretty outputs TSV data in a more human readable format. This is done by lining
up data into fixed-width columns. Text is left aligned, numbers are right aligned.
Floating point numbers are aligned on the decimal point when feasible.

Options:
EOS";
96 
/* TsvPrettyOptions holds the command line settings for tsv-pretty, along with the
 * values derived from them during processArgs.
 */
struct TsvPrettyOptions
{
    string programName;

    /* Help and version requests. */
    bool helpVerbose = false;           // --help-verbose
    bool versionWanted = false;         // --V|version

    /* Header handling. processArgs relies on these three defaults. */
    bool hasHeader = false;             // --H|header
    bool noHeader = false;              // --x|no-header
    bool autoDetectHeader = true;       // Derived: cleared when either option above is given

    size_t lookahead = 1000;            // --l|lookahead
    size_t repeatHeader = 0;            // --r|repeat-header num (zero means no repeat)
    bool underlineHeader = false;       // --u|underline-header

    /* Float formatting. */
    bool formatFloats = false;          // --f|format-floats
    size_t floatPrecision = 9;          // --p|precision num (max precision when formatting floats)

    /* Empty field replacement. */
    bool replaceEmpty = false;          // --e|replace-empty
    string emptyReplacement = "";       // --E|empty-replacement
    size_t emptyReplacementPrintWidth = 0;    // Derived: print width of emptyReplacement

    /* Layout. */
    char delim = '\t';                  // --d|delimiter
    size_t spaceBetweenFields = 2;      // --s|space-between-fields num
    size_t maxFieldPrintWidth = 40;     // --m|max-text-width num; Max width for variable width text fields.
    size_t preambleLines = 0;           // --a|preamble; Number of preamble lines.

    /* Parses the command line and fills in this struct.
     *
     * Returns a tuple. The first value is true when the arguments were processed
     * successfully and execution should continue. It is false when an error occurred
     * or when help or version output was requested and printed. When it is false, the
     * second value is the exit code to use: 0 for help/version, 1 for an error.
     *
     * A true result means the options have been validated and the derived members
     * (autoDetectHeader, replaceEmpty/emptyReplacement, emptyReplacementPrintWidth)
     * have been set. Errors are reported on stderr rather than thrown to the caller.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        if (cmdArgs.length > 0) programName = cmdArgs[0].stripExtension.baseName;
        else programName = "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto parsed = getopt(
                cmdArgs,
                "help-verbose",           "       Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",               "       Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "x|no-header",            "       Assume no header. Turns off automatic header detection.", &noHeader,
                "l|lookahead",            "NUM    Lines to read to interpret data before generating output. Default: 1000", &lookahead,

                "r|repeat-header",        "NUM    Lines to print before repeating the header. Default: No repeating header", &repeatHeader,

                "u|underline-header",     "       Underline the header.", &underlineHeader,
                "f|format-floats",        "       Format floats for better readability. Default: No", &formatFloats,
                "p|precision",            "NUM    Max floating point precision. Implies --format-floats. Default: 9", &floatPrecisionOptionHandler,
                std.getopt.config.caseSensitive,
                "e|replace-empty",        "       Replace empty fields with '--'.", &replaceEmpty,
                "E|empty-replacement",    "STR    Replace empty fields with a string.", &emptyReplacement,
                std.getopt.config.caseInsensitive,
                "d|delimiter",            "CHR    Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                "s|space-between-fields", "NUM    Spaces between each field (Default: 2)", &spaceBetweenFields,
                "m|max-text-width",       "NUM    Max reserved field width for variable width text fields. Default: 40", &maxFieldPrintWidth,
                "a|preamble",             "NUM    Treat the first NUM lines as a preamble and output them unchanged.", &preambleLines,
                std.getopt.config.caseSensitive,
                "V|version",              "       Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            /* Informational requests: print and stop with a success exit code. */
            if (parsed.helpWanted)
            {
                defaultGetoptPrinter(helpText, parsed.options);
                return tuple(false, 0);
            }
            if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, parsed.options);
                return tuple(false, 0);
            }
            if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-pretty"));
                return tuple(false, 0);
            }

            /* Validation and derivations. */
            if (noHeader && hasHeader)
            {
                throw new Exception("Cannot specify both --H|header and --x|no-header.");
            }

            /* An explicit header choice switches off auto-detection. */
            if (noHeader || hasHeader) autoDetectHeader = false;

            /* Zero look-ahead has limited utility unless the first line is known to
             * be a header. Good chance the user will get an unintended behavior.
             */
            if (lookahead == 0 && autoDetectHeader)
            {
                assert (!noHeader && !hasHeader);
                throw new Exception("Cannot auto-detect header with zero look-ahead. Specify either '--H|header' or '--x|no-header' when using '--l|lookahead 0'.");
            }

            /* --replace-empty without an explicit string uses "--". An explicit
             * string turns replacement on by itself.
             */
            if (emptyReplacement.length == 0 && replaceEmpty) emptyReplacement = "--";

            if (emptyReplacement.length > 0)
            {
                replaceEmpty = true;
                emptyReplacementPrintWidth = emptyReplacement.monospacePrintWidth;
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }

        return tuple(true, 0);
    }

    /* Option handler for --p|precision. Stores the precision and also switches on
     * --f|format-floats. The conversion runs first, so a value that fails to
     * convert leaves formatFloats untouched.
     */
    private void floatPrecisionOptionHandler(string option, string optionVal) @safe pure
    {
        import std.conv : to;

        floatPrecision = to!size_t(optionVal);
        formatFloats = true;
    }
}
220 
/** tsvPretty - Main loop, operating on input files and passing control to a
 * TsvPrettyProcessor instance. This separates physical I/O sources and sinks
 * from the underlying processing algorithm, which operates on generic ranges.
 *
 * Each file is read line by line. Lines up to options.preambleLines go to
 * processPreambleLine, the next line goes to processFileFirstLine, and all later
 * lines go to processLine. A file name of "-" reads standard input, as does an
 * empty file list.
 *
 * A lockingTextWriter is created for every input line rather than held for the
 * whole run. The original author notes this has the effect of flushing standard
 * output every line, which is desirable in command line tools.
 */
void tsvPretty(in ref TsvPrettyOptions options, string[] files)
{
    immutable size_t firstDataLineNum = options.preambleLines + 1;
    auto processor = TsvPrettyProcessor(options);
    auto inputNames = (files.length > 0) ? files : ["-"];

    foreach (filename; inputNames)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        /* Line numbers are one-based so they compare directly to firstDataLineNum. */
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            auto sink = outputRangeObject!(char, char[])(stdout.lockingTextWriter);

            if (lineNum > firstDataLineNum)
            {
                processor.processLine(sink, line);
            }
            else if (lineNum == firstDataLineNum)
            {
                processor.processFileFirstLine(sink, line);
            }
            else
            {
                processor.processPreambleLine(sink, line);
            }
        }
    }

    /* Flush whatever is still held in the look-ahead cache. */
    processor.finish(outputRangeObject!(char, char[])(stdout.lockingTextWriter));
}
253 
/** TsvPrettyProcessor - Maintains state of processing and exposes operations for
 * processing individual input lines.
 *
 * TsvPrettyProcessor knows that input is file-based, but doesn't deal with actual
 * files or reading lines from input. That is the job of the caller. Output is
 * written to an output range. The caller is expected to pass each line in the
 * order received; the processing logic is built on that assumption.
 *
 * In addition to the constructor, there are four API methods:
 *   * processPreambleLine - Called to process a preamble line occurring before
 *     the header line or first line of data.
 *   * processFileFirstLine - Called to process the first line of each file. This
 *     enables header processing.
 *   * processLine - Called to process all lines except for the first line of a file.
 *   * finish - Called at the end of all processing. This is needed in case the
 *     look-ahead cache is still being filled when input terminates.
 */

struct TsvPrettyProcessor
{
    import std.array : appender;

/* NOTE(review): this label covers every member below, including the constructor
 * and the four API methods, so they are reachable only from within this module.
 */
private:
    enum AutoDetectHeaderResult { none, hasHeader, noHeader }

    TsvPrettyOptions _options;
    size_t _fileCount = 0;               // Files for which a first line has been seen
    size_t _dataLineOutputCount = 0;     // Data lines written so far
    bool _stillCaching = true;           // True until the look-ahead cache has been output
    string _candidateHeaderLine;         // First line of the first file, while auto-detecting
    auto _lookaheadCache = appender!(string[])();
    FieldFormat[] _fieldVector;          // One entry per column seen so far
    AutoDetectHeaderResult _autoDetectHeaderResult = AutoDetectHeaderResult.none;

    /* Copies the options. Caching is skipped from the start only when there is
     * no header and no look-ahead.
     */
    this(const TsvPrettyOptions options) @safe pure nothrow @nogc
    {
        _options = options;
        _stillCaching = !(options.noHeader && options.lookahead == 0);
    }

    invariant
    {
        /* Exactly how the header is handled must always be determined by one of these. */
        assert(_options.hasHeader || _options.noHeader || _options.autoDetectHeader);

        /* The cache never holds a full look-ahead's worth of lines. */
        immutable size_t cachedLines = _lookaheadCache.data.length;
        assert((_options.lookahead == 0 && cachedLines == 0) ||
               cachedLines < _options.lookahead);
    }

    /* Writes a preamble line unchanged, followed by a newline. Preamble lines are
     * written only while no file's first line has been processed yet.
     */
    void processPreambleLine(OutputRange!char outputStream, const char[] line)
    {
        if (_fileCount != 0) return;

        put(outputStream, line);
        put(outputStream, '\n');
    }

    /* Handles the first non-preamble line of a file. Depending on the header mode
     * the line is treated as data, as the header, or as a header candidate.
     */
    void processFileFirstLine(OutputRange!char outputStream, const char[] line)
    {
        _fileCount++;

        if (_options.noHeader)
        {
            processLine(outputStream, line);
            return;
        }

        if (_options.hasHeader)
        {
            /* Only the first file's header is used; later ones are dropped. */
            if (_fileCount == 1)
            {
                setHeaderLine(line);
                if (_options.lookahead == 0) outputLookaheadCache(outputStream);
            }
            return;
        }

        assert(_options.autoDetectHeader);

        final switch (_autoDetectHeaderResult)
        {
        case AutoDetectHeaderResult.hasHeader:
            /* Already decided there is a header: skip this file's copy of it. */
            assert(_fileCount > 1);
            break;

        case AutoDetectHeaderResult.noHeader:
            /* Already decided there is no header: the line is data. */
            assert(_fileCount > 1);
            processLine(outputStream, line);
            break;

        case AutoDetectHeaderResult.none:
            if (_fileCount == 1)
            {
                /* Hold the line until there is enough data to decide. */
                assert(_candidateHeaderLine.length == 0);
                _candidateHeaderLine = line.idup;
            }
            else if (_fileCount == 2)
            {
                /* A second file starting with the same line settles the question. */
                if (_candidateHeaderLine == line)
                {
                    _autoDetectHeaderResult = AutoDetectHeaderResult.hasHeader;
                    setHeaderLine(_candidateHeaderLine);

                    /* Edge case: First file has only a header line and look-ahead set to zero. */
                    if (_stillCaching && _options.lookahead == 0) outputLookaheadCache(outputStream);
                }
                else
                {
                    _autoDetectHeaderResult = AutoDetectHeaderResult.noHeader;
                    updateFieldFormatsForLine(_candidateHeaderLine);
                    processLine(outputStream, line);
                }
            }
            break;
        }
    }

    /* Handles a data line: cached during look-ahead, written directly afterwards. */
    void processLine(OutputRange!char outputStream, const char[] line)
    {
        if (_stillCaching)
        {
            cacheDataLine(outputStream, line);
        }
        else
        {
            outputDataLine(outputStream, line);
        }
    }

    /* Called once input is exhausted. Writes out the cache if it was never filled. */
    void finish(OutputRange!char outputStream)
    {
        if (!_stillCaching) return;
        outputLookaheadCache(outputStream);
    }

    /* ---- Internal helpers ---- */

    /* True when a header is being used, either by option or by auto-detection. */
    bool headerInUse() const @safe pure nothrow @nogc
    {
        if (_options.hasHeader) return true;
        return _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.hasHeader;
    }

    /* Returns the FieldFormat for a column, appending a new entry when the index
     * is exactly one past the current end.
     */
    ref FieldFormat fieldFormatAt(size_t fieldIndex) @safe
    {
        if (fieldIndex == _fieldVector.length) _fieldVector ~= FieldFormat(fieldIndex);
        assert(fieldIndex < _fieldVector.length);
        return _fieldVector[fieldIndex];
    }

    /* outputLookaheadCache finalizes processing of the look-ahead cache. This
     * includes settling the auto-detect header decision, setting the type and
     * width of each field, and outputting all lines in the cache.
     */
    void outputLookaheadCache(OutputRange!char outputStream)
    {
        assert(_stillCaching);

        immutable bool haveCandidate = _candidateHeaderLine.length != 0;

        /* Settle a pending auto-detect decision using the cached data. */
        if (_options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.none &&
            haveCandidate)
        {
            immutable bool looksLikeHeader = candidateHeaderLooksLikeHeader();
            _autoDetectHeaderResult = looksLikeHeader ?
                AutoDetectHeaderResult.hasHeader :
                AutoDetectHeaderResult.noHeader;
            if (looksLikeHeader) setHeaderLine(_candidateHeaderLine);
        }

        immutable bool writeHeaderFirst = headerInUse();

        /* A rejected candidate is ordinary data and becomes the first data line. */
        immutable bool candidateIsData =
            !writeHeaderFirst &&
            _options.autoDetectHeader &&
            _autoDetectHeaderResult == AutoDetectHeaderResult.noHeader &&
            haveCandidate;

        if (candidateIsData) updateFieldFormatsForLine(_candidateHeaderLine);

        finalizeFieldFormatting();

        if (writeHeaderFirst)
        {
            outputHeader(outputStream);
        }
        else if (candidateIsData)
        {
            outputDataLine(outputStream, _candidateHeaderLine);
        }

        foreach (cachedLine; _lookaheadCache.data)
        {
            outputDataLine(outputStream, cachedLine);
        }

        _lookaheadCache.clear;
        _stillCaching = false;
    }

    /* The candidate header is declared as the header if the look-ahead cache has at
     * least one numeric field that is text in the candidate header.
     */
    bool candidateHeaderLooksLikeHeader() @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, fieldValue; _candidateHeaderLine.splitter(_options.delim).enumerate)
        {
            /* Classify the candidate's value on a scratch FieldFormat. */
            auto probe = FieldFormat(fieldIndex);
            probe.updateForFieldValue(fieldValue, _options);

            if (fieldIndex >= _fieldVector.length) continue;
            if (probe.fieldType != FieldType.text) continue;

            immutable FieldType cachedType = _fieldVector[fieldIndex].fieldType;
            if (cachedType == FieldType.integer ||
                cachedType == FieldType.floatingPoint ||
                cachedType == FieldType.exponent)
            {
                return true;
            }
        }

        return false;
    }

    /* Records each field of the line as the header text of its column. */
    void setHeaderLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, header; line.splitter(_options.delim).enumerate)
        {
            fieldFormatAt(fieldIndex).setHeader(header);
        }
    }

    /* Adds a line to the look-ahead cache and writes the cache out once it
     * reaches the configured look-ahead size.
     */
    void cacheDataLine(OutputRange!char outputStream, const char[] line)
    {
        assert(_lookaheadCache.data.length < _options.lookahead);

        _lookaheadCache.put(line.idup);
        updateFieldFormatsForLine(line);

        immutable bool cacheIsFull = _lookaheadCache.data.length == _options.lookahead;
        if (cacheIsFull) outputLookaheadCache(outputStream);
    }

    /* Feeds each field of the line to its column's FieldFormat as a value sample. */
    void updateFieldFormatsForLine(const char[] line) @safe
    {
        import std.algorithm : splitter;

        foreach (fieldIndex, fieldValue; line.splitter(_options.delim).enumerate)
        {
            fieldFormatAt(fieldIndex).updateForFieldValue(fieldValue, _options);
        }
    }

    /* Finalizes every column in order. Each column starts where the previous one
     * ended plus the configured spacing.
     */
    void finalizeFieldFormatting() @safe pure @nogc nothrow
    {
        size_t startOfNext = 0;
        foreach (i; 0 .. _fieldVector.length)
        {
            immutable size_t endOfThis = _fieldVector[i].finalizeFormatting(startOfNext, _options);
            startOfNext = endOfThis + _options.spaceBetweenFields;
        }
    }

    /* Writes one header row: the header text, or the underline beneath it. */
    void writeHeaderRow(Flag!"writeUnderline" asUnderline)(OutputRange!char outputStream)
    {
        size_t column = 0;
        foreach (field; _fieldVector)
        {
            /* Pad out to the column's start position before writing. */
            immutable size_t gap = field.startPosition - column;
            put(outputStream, repeat(" ", gap));
            column += gap;
            column += field.writeHeader!(asUnderline)(outputStream, _options);
        }
        put(outputStream, '\n');
    }

    /* Writes the header line, plus an underline row when that option is set. */
    void outputHeader(OutputRange!char outputStream)
    {
        writeHeaderRow!(No.writeUnderline)(outputStream);
        if (_options.underlineHeader) writeHeaderRow!(Yes.writeUnderline)(outputStream);
    }

    /* Writes one aligned data line, preceded by a repeated header when one is due. */
    void outputDataLine(OutputRange!char outputStream, const char[] line)
    {
        import std.algorithm : splitter;

        /* Repeating header option. Short-circuiting keeps the modulus from
         * running when repeatHeader is zero.
         */
        immutable bool repeatDue =
            _options.repeatHeader != 0 &&
            _dataLineOutputCount != 0 &&
            headerInUse() &&
            _dataLineOutputCount % _options.repeatHeader == 0;

        if (repeatDue)
        {
            put(outputStream, '\n');
            outputHeader(outputStream);
        }

        _dataLineOutputCount++;

        size_t column = 0;
        foreach (fieldIndex, fieldValue; line.splitter(_options.delim).enumerate)
        {
            if (fieldIndex == _fieldVector.length)
            {
                /* Line is longer than any seen while caching. Add a new FieldFormat entry
                 * and set the line formatting based on this field value.
                 */
                _fieldVector ~= FieldFormat(fieldIndex);

                size_t newFieldStart = 0;
                if (fieldIndex != 0)
                {
                    newFieldStart = _fieldVector[fieldIndex - 1].endPosition + _options.spaceBetweenFields;
                }

                _fieldVector[$ - 1].updateForFieldValue(fieldValue, _options);
                _fieldVector[$ - 1].finalizeFormatting(newFieldStart, _options);
            }

            assert(fieldIndex < _fieldVector.length);

            FieldFormat fieldFormat = _fieldVector[fieldIndex];
            immutable size_t fieldStart = fieldFormat.startPosition;

            size_t gap;
            if (column < fieldStart)
            {
                gap = fieldStart - column;
            }
            else if (fieldIndex != 0)
            {
                gap = 1;    // Previous field went long. One space between fields
            }
            else
            {
                gap = 0;
            }

            put(outputStream, repeat(" ", gap));
            column += gap;
            column += fieldFormat.writeFieldValue(outputStream, column, fieldValue, _options);
        }
        put(outputStream, '\n');
    }
}
575 
576 /** FieldFormat holds all the formatting info needed to format data values in a specific
577  * column. e.g. Field 1 may be text, field 2 may be a float, etc. This is calculated
578  * during the caching phase. Each FieldFormat instance is part of a vector representing
579  * the full row, so each includes the start position on the line and similar data.
580  *
581  * APIs used during the caching phase to gather field value samples
582  * - this - Initial construction. Takes the field index.
583  * - setHeader - Used to set the header text.
584  * - updateForFieldValue - Used to add the next field value sample.
585  * - finalizeFormatting - Used at the end of caching to finalize the format choices.
586  *
587  * APIs used after caching is finished (after finalizeFormatting):
588  * - startPosition - Returns the expected start position for the field.
589  * - endPosition - Returns the expected end position for the field.
590  * - writeHeader - Outputs the header, properly aligned.
591  * - writeFieldValue - Outputs the current field value, properly aligned.
592  */
593 
/* Data type detected for a column. A new FieldFormat starts out as unknown. */
enum FieldType
{
    unknown,
    text,
    integer,
    floatingPoint,
    exponent
}

/* Side of the column a value is aligned to. A new FieldFormat starts out as left. */
enum FieldAlignment
{
    left,
    right
}
596 
597 struct FieldFormat
598 {
599 private:
600     size_t _fieldIndex;                  // Zero-based index in the line
601     string _header = "";                 // Original field header
602     size_t _headerPrintWidth = 0;
603     FieldType _type = FieldType.unknown;
604     FieldAlignment _alignment = FieldAlignment.left;
605     size_t _startPosition = 0;
606     size_t _printWidth = 0;
607     size_t _precision = 0;          // Number of digits after the decimal point
608 
609     /* These are used while doing initial type and print format detection. */
610     size_t _minRawPrintWidth = 0;
611     size_t _maxRawPrintWidth = 0;
612     size_t _maxDigitsBeforeDecimal = 0;
613     size_t _maxDigitsAfterDecimal = 0;
614     size_t _maxSignificantDigits = 0;  // Digits to include in exponential notation
615 
616 public:
617     this(size_t fieldIndex) @safe pure nothrow @nogc
618     {
619         _fieldIndex = fieldIndex;
620     }
621 
622     /* setHeader is called to set the header text. */
623     void setHeader(const char[] header) @safe
624     {
625         import std.conv : to;
626 
627         _header = header.to!string;
628         _headerPrintWidth = _header.monospacePrintWidth;
629     }
630 
631     size_t startPosition() nothrow pure @safe @property
632     {
633         return _startPosition;
634     }
635 
636     size_t endPosition() nothrow pure @safe @property
637     {
638         return _startPosition + _printWidth;
639     }
640 
641     FieldType fieldType() nothrow pure @safe @property
642     {
643         return _type;
644     }
645 
646     /** writeHeader - Writes the field header or underline characters to the output stream.
647      *
648      * The current output position should have been written up to the field's start position,
649      * including any spaces between fields. Unlike data fields, there is no need to correct
650      * for previous fields that have run long. This routine does not output trailing spaces.
651      * This makes it simpler for lines to avoid unnecessary trailing spaces.
652      *
653      * Underlines can either be written the full width of the field or the just under the
654      * text of the header. At present this is a template parameter (compile-time).
655      *
656      * The print width of the output is returned.
657      */
658     size_t writeHeader (Flag!"writeUnderline" writeUnderline = No.writeUnderline,
659                         Flag!"fullWidthUnderline" fullWidthUnderline = No.fullWidthUnderline)
660         (OutputRange!char outputStream, in ref TsvPrettyOptions options)
661     {
662         import std.range : repeat;
663 
664         size_t positionsWritten = 0;
665         if (_headerPrintWidth > 0)
666         {
667             static if (writeUnderline)
668             {
669                 static if (fullWidthUnderline)
670                 {
671                     put(outputStream, repeat("-", _printWidth));
672                     positionsWritten += _printWidth;
673                 }
674                 else  // Underline beneath the header text only
675                 {
676                     if (_alignment == FieldAlignment.right)
677                     {
678                         put(outputStream, repeat(" ", _printWidth - _headerPrintWidth));
679                         positionsWritten += _printWidth - _headerPrintWidth;
680                     }
681                     put(outputStream, repeat("-", _headerPrintWidth));
682                     positionsWritten += _headerPrintWidth;
683                 }
684             }
685             else
686             {
687                 if (_alignment == FieldAlignment.right)
688                 {
689                     put(outputStream, repeat(" ", _printWidth - _headerPrintWidth));
690                     positionsWritten += _printWidth - _headerPrintWidth;
691                 }
692                 put(outputStream, _header);
693                 positionsWritten += _headerPrintWidth;
694             }
695         }
696         return positionsWritten;
697     }
698 
699     /* writeFieldValue writes the field value for the current column The caller needs
700      * to generate output at least to the column's start position, but can go beyond
701      * if previous fields have run long.
702      *
703      * The field value is aligned properly in the field. Either left aligned (text) or
704      * right aligned (numeric). Floating point fields are both right aligned and
705      * decimal point aligned. The number of bytes written is returned. Trailing spaces
706      * are not added, the caller must add any necessary trailing spaces prior to
707      * printing the next field.
708      */
709     size_t writeFieldValue(OutputRange!char outputStream, size_t currPosition,
710                            const char[] fieldValue, in ref TsvPrettyOptions options)
711     in
712     {
713         assert(currPosition >= _startPosition);   // Caller resposible for advancing to field start position.
714         assert(_type == FieldType.text || _type == FieldType.integer ||
715                _type == FieldType.floatingPoint || _type == FieldType.exponent);
716     }
717     body
718     {
719         import std.algorithm : find, max, min;
720         import std.conv : to, ConvException;
721         import std.format : format;
722 
723         /* Create the print version of the string. Either the raw value or a formatted
724          * version of a float.
725          */
726         string printValue;
727         if (!options.formatFloats || _type == FieldType.text || _type == FieldType.integer)
728         {
729             printValue = fieldValue.to!string;
730         }
731         else
732         {
733             assert(options.formatFloats);
734             assert(_type == FieldType.exponent || _type == FieldType.floatingPoint);
735 
736             if (_type == FieldType.exponent)
737             {
738                 printValue = fieldValue.formatExponentValue(_precision);
739             }
740             else
741             {
742                 printValue = fieldValue.formatFloatingPointValue(_precision);
743             }
744         }
745 
746         if (printValue.length == 0 && options.replaceEmpty) printValue = options.emptyReplacement;
747         size_t printValuePrintWidth = printValue.monospacePrintWidth;
748 
749         /* Calculate leading spaces needed for right alignment. */
750         size_t leadingSpaces = 0;
751         if (_alignment == FieldAlignment.right)
752         {
753             /* Target width adjusts the column width to account for overrun by the previous field. */
754             size_t targetWidth;
755             if (currPosition == _startPosition)
756             {
757                 targetWidth = _printWidth;
758             }
759             else
760             {
761                 size_t startGap = currPosition - _startPosition;
762                 targetWidth = max(printValuePrintWidth,
763                                   startGap < _printWidth ? _printWidth - startGap : 0);
764             }
765 
766             leadingSpaces = (printValuePrintWidth < targetWidth) ?
767                 targetWidth - printValuePrintWidth : 0;
768 
769             /* The above calculation assumes the print value is fully right aligned.
770              * This is not correct when raw value floats are being used rather than
771              * formatted floats, as different values will have different precision.
772              * The next adjustment accounts for this, dropping leading spaces as
773              * needed to align the decimal point. Note that text and exponential
774              * values get aligned strictly against right boundaries.
775              */
776             if (leadingSpaces > 0 && _precision > 0 &&
777                 _type == FieldType.floatingPoint && !options.formatFloats)
778             {
779                 import std.algorithm : canFind, findSplit;
780                 import std.string : isNumeric;
781 
782                 if (printValue.isNumeric && !printValue.canFind!(x => x == 'e' || x == 'E'))
783                 {
784                     size_t decimalAndDigitsLength = printValue.find(".").length;
785                     size_t trailingSpaces =
786                         (decimalAndDigitsLength == 0) ? _precision + 1 :
787                         (decimalAndDigitsLength > _precision) ? 0 :
788                         _precision + 1 - decimalAndDigitsLength;
789 
790                     leadingSpaces = (leadingSpaces > trailingSpaces) ?
791                         leadingSpaces - trailingSpaces : 0;
792                 }
793             }
794         }
795         put(outputStream, repeat(' ', leadingSpaces));
796         put(outputStream, printValue);
797         return printValuePrintWidth + leadingSpaces;
798     }
799 
800     /** updateForFieldValue updates type and format given a new field value.
801      *
802      * This is called during look-ahead caching to register a new sample value for the
803      * column. The key components updates are field type and print width.
804      */
805     void updateForFieldValue(const char[] fieldValue, in ref TsvPrettyOptions options) @safe
806     {
807         import std.algorithm : findAmong, findSplit, max, min;
808         import std.conv : to, ConvException;
809         import std.string : isNumeric;
810 
811         size_t fieldValuePrintWidth = fieldValue.monospacePrintWidth;
812         size_t fieldValuePrintWidthWithEmpty =
813             (fieldValuePrintWidth == 0 && options.replaceEmpty) ?
814             options.emptyReplacementPrintWidth :
815             fieldValuePrintWidth;
816 
817         _maxRawPrintWidth = max(_maxRawPrintWidth, fieldValuePrintWidthWithEmpty);
818         _minRawPrintWidth = (_minRawPrintWidth == 0) ?
819             fieldValuePrintWidthWithEmpty :
820             min(_minRawPrintWidth, fieldValuePrintWidthWithEmpty);
821 
822         if (_type == FieldType.text)
823         {
824             /* Already text, can't become anything else. */
825         }
826         else if (fieldValuePrintWidth == 0)
827         {
828             /* Don't let an empty field override a numeric field type. */
829         }
830         else if (!fieldValue.isNumeric)
831         {
832             /* Not parsable as a number. Switch from unknown or numeric type to text. */
833             _type = FieldType.text;
834         }
835         else
836         {
837             /* Field type is currently unknown or numeric, and current field parses as numeric.
838              * See if it parses as integer or float. Integers will parse as floats, so try
839              * integer types first.
840              */
841             FieldType parsesAs = FieldType.unknown;
842             long longValue;
843             ulong ulongValue;
844             double doubleValue;
845             try
846             {
847                 longValue = fieldValue.to!long;
848                 parsesAs = FieldType.integer;
849             }
850             catch (ConvException)
851             {
852                 try
853                 {
854                     ulongValue = fieldValue.to!ulong;
855                     parsesAs = FieldType.integer;
856                 }
857                 catch (ConvException)
858                 {
859                     try
860                     {
861                         doubleValue = fieldValue.to!double;
862                         import std.algorithm : findAmong;
863                         parsesAs = (fieldValue.findAmong("eE").length == 0) ?
864                             FieldType.floatingPoint : FieldType.exponent;
865                     }
866                     catch (ConvException)
867                     {
868                         /* Note: This means isNumeric thinks it's a number, but conversions all failed. */
869                         parsesAs = FieldType.text;
870                     }
871                 }
872             }
873 
874             if (parsesAs == FieldType.text)
875             {
876                 /* Not parsable as a number (despite isNumeric result). Switch to text type. */
877                 _type = FieldType.text;
878             }
879             else if (parsesAs == FieldType.exponent)
880             {
881                 /* Exponential notion supersedes both vanilla floats and integers. */
882                 _type = FieldType.exponent;
883                 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);
884 
885                 if (auto decimalSplit = fieldValue.findSplit("."))
886                 {
887                     auto fromExponent = decimalSplit[2].findAmong("eE");
888                     size_t numDigitsAfterDecimal = decimalSplit[2].length - fromExponent.length;
889                     _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length);
890                     _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, numDigitsAfterDecimal);
891                 }
892                 else
893                 {
894                     /* Exponent without a decimal point. */
895                     auto fromExponent = fieldValue.findAmong("eE");
896                     assert(fromExponent.length > 0);
897                     size_t numDigits = fieldValue.length - fromExponent.length;
898                     _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, numDigits);
899                 }
900             }
901             else if (parsesAs == FieldType.floatingPoint)
902             {
903                 /* Floating point supercedes integer but not exponential. */
904                 if (_type != FieldType.exponent) _type = FieldType.floatingPoint;
905                 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);
906 
907                 if (auto decimalSplit = fieldValue.findSplit("."))
908                 {
909                     _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, decimalSplit[0].length);
910                     _maxDigitsAfterDecimal = max(_maxDigitsAfterDecimal, decimalSplit[2].length);
911                 }
912             }
913             else
914             {
915                 assert(parsesAs == FieldType.integer);
916                 if (_type != FieldType.floatingPoint) _type = FieldType.integer;
917                 _maxSignificantDigits = max(_maxSignificantDigits, fieldValue.significantDigits);
918                 _maxDigitsBeforeDecimal = max(_maxDigitsBeforeDecimal, fieldValue.length);
919             }
920         }
921     }
922 
923     /* finalizeFormatting updates field formatting info based on the current state. It is
924      * expected to be called after adding field entries via updateForFieldValue(). It
925      * returns its new end position.
926      */
927     size_t finalizeFormatting (size_t startPosition, in ref TsvPrettyOptions options) @safe pure @nogc nothrow
928     {
929         import std.algorithm : max, min;
930         _startPosition = startPosition;
931         if (_type == FieldType.unknown) _type = FieldType.text;
932         _alignment = (_type == FieldType.integer || _type == FieldType.floatingPoint
933                       || _type == FieldType.exponent) ?
934             FieldAlignment.right :
935             FieldAlignment.left;
936 
937         if (_type == FieldType.floatingPoint)
938         {
939             size_t precision = min(options.floatPrecision, _maxDigitsAfterDecimal);
940             size_t maxValueWidth = _maxDigitsBeforeDecimal + precision;
941             if (precision > 0) maxValueWidth++;  // Account for the decimal point.
942             _printWidth = max(1, _headerPrintWidth, maxValueWidth);
943             _precision = precision;
944         }
945         else if (_type == FieldType.exponent)
946         {
947             size_t maxPrecision = (_maxSignificantDigits > 0) ? _maxSignificantDigits - 1 : 0;
948             _precision = min(options.floatPrecision, maxPrecision);
949 
950             size_t maxValuePrintWidth = !options.formatFloats ? _maxRawPrintWidth : _precision + 7;
951             _printWidth = max(1, _headerPrintWidth, maxValuePrintWidth);
952         }
953         else if (_type == FieldType.integer)
954         {
955             _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth, _maxRawPrintWidth);
956             _precision = 0;
957         }
958         else
959         {
960             _printWidth = max(1, _headerPrintWidth, _minRawPrintWidth,
961                               min(options.maxFieldPrintWidth, _maxRawPrintWidth));
962             _precision = 0;
963         }
964 
965         return _startPosition + _printWidth;
966     }
967 }
968 
/** formatFloatingPointValue - Renders a raw value as a fixed precision floating point
 * number. Digits after the decimal point are zero padded or rounded away so that
 * exactly `precision` of them are shown.
 *
 * The raw value is returned unchanged when it contains an exponent marker ('e' or 'E'),
 * when it cannot be converted to a double, or when it converts to NaN or infinity.
 *
 * This routine is used to format values in columns identified as floating point.
 *
 * Params:
 *   value     = Raw field value.
 *   precision = Number of digits to show after the decimal point.
 *
 * Returns: The formatted value, or a copy of the raw value as described above.
 */
string formatFloatingPointValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    /* Exponential notation. Use the raw value. */
    if (value.canFind!(c => c == 'e' || c == 'E')) return value.to!string;

    try
    {
        immutable double number = value.to!double;
        if (!number.isFinite) return value.to!string;  // NaN or Infinity

        immutable size_t digitsInValue = value.precisionDigits;

        /* Enough digits present: let format do any rounding. */
        if (digitsInValue >= precision) return format("%.*f", precision, number);

        /* Too few digits: print what is there, then zero pad. A value with no
         * fractional digits needs the decimal point added first.
         */
        string rendered = format("%.*f", digitsInValue, number);
        if (digitsInValue == 0) rendered ~= ".";
        return rendered ~ repeat("0", precision - digitsInValue).join;
    }
    catch (ConvException)
    {
        return value.to!string;
    }
}
1021 
@safe unittest
{
    /* Table of (raw value, precision, expected rendering) for formatFloatingPointValue. */
    static struct Case
    {
        string raw;
        size_t precision;
        string expected;
    }

    const Case[] cases = [
        /* Values that are not finite numbers come back unchanged. */
        Case("", 3, ""),
        Case(" ", 3, " "),
        Case("abc", 3, "abc"),
        Case("nan", 3, "nan"),

        /* Padding, truncation, and rounding. */
        Case("0", 0, "0"),
        Case("1", 0, "1"),
        Case("1.", 0, "1"),
        Case("1", 3, "1.000"),
        Case("1000", 3, "1000.000"),
        Case("1000.001", 5, "1000.00100"),
        Case("1000.001", 3, "1000.001"),
        Case("1000.001", 2, "1000.00"),
        Case("1000.006", 2, "1000.01"),
        Case("-0.1", 1, "-0.1"),
        Case("-0.1", 3, "-0.100"),
        Case("-0.001", 3, "-0.001"),
        Case("-0.006", 2, "-0.01"),
        Case("-0.001", 1, "-0.0"),
        Case("-0.001", 0, "-0"),

        /* Exponential notation is passed through as-is. */
        Case("0e+00", 0, "0e+00"),
        Case("0.00e+00", 0, "0.00e+00"),
        Case("1e+06", 1, "1e+06"),
        Case("1e+06", 2, "1e+06"),
        Case("1E-06", 1, "1E-06"),
        Case("1.1E+6", 2, "1.1E+6"),
        Case("1.1E+100", 2, "1.1E+100"),
    ];

    foreach (c; cases)
    {
        assert(c.raw.formatFloatingPointValue(c.precision) == c.expected);
    }
}
1051 
/** formatExponentValue - Renders a raw value in exponential notation with exactly
 * `precision` digits after the decimal point, zero padding or rounding as needed.
 *
 * A copy of the raw value is returned when it cannot be converted to a double or when
 * it converts to NaN or infinity.
 *
 * This routine is used to format values in columns identified as having exponent format.
 *
 * Params:
 *   value     = Raw field value.
 *   precision = Number of digits to show after the decimal point of the mantissa.
 *
 * Returns: The formatted value, or a copy of the raw value as described above.
 */
string formatExponentValue(const char[] value, size_t precision) @safe
{
    import std.algorithm : canFind, findSplit;
    import std.array : join;
    import std.conv : to, ConvException;
    import std.format : format;
    import std.math : isFinite;
    import std.range : repeat;

    try
    {
        immutable double number = value.to!double;
        if (!number.isFinite) return value.to!string;  // NaN or Infinity

        /* Mantissa digits after the decimal point that the raw value can supply. */
        immutable size_t sigDigits = value.significantDigits;
        immutable size_t digitsAvailable = (sigDigits == 0) ? 0 : sigDigits - 1;

        /* Enough digits present: let format do any rounding. */
        if (digitsAvailable >= precision) return format("%.*e", precision, number);

        /* Too few digits: format with what is available, then insert the zero
         * padding between the mantissa and the exponent.
         */
        string shortForm = format("%.*e", digitsAvailable, number);
        auto pieces = shortForm.findSplit("e");   // Same exponent case as the format call.

        string mantissa = pieces[0];
        if (digitsAvailable == 0)
        {
            assert(precision != 0);
            assert(!mantissa.canFind("."));
            mantissa ~= ".";
        }

        return mantissa ~ repeat("0", precision - digitsAvailable).join ~ pieces[1] ~ pieces[2];
    }
    catch (ConvException)
    {
        return value.to!string;
    }
}
1101 
@safe unittest
{
    /* Table of (raw value, precision, expected rendering) for formatExponentValue. */
    static struct Case
    {
        string raw;
        size_t precision;
        string expected;
    }

    const Case[] cases = [
        /* Values that are not finite numbers come back unchanged. */
        Case("", 3, ""),
        Case(" ", 3, " "),
        Case("abc", 3, "abc"),
        Case("nan", 3, "nan"),

        /* Plain integers and floats converted to exponential notation. */
        Case("0", 0, "0e+00"),
        Case("1", 0, "1e+00"),
        Case("1.", 0, "1e+00"),
        Case("1", 3, "1.000e+00"),
        Case("1000", 3, "1.000e+03"),
        Case("1000.001", 5, "1.00000e+03"),
        Case("1000.001", 3, "1.000e+03"),
        Case("1000.001", 6, "1.000001e+03"),
        Case("1000.006", 5, "1.00001e+03"),
        Case("-0.1", 1, "-1.0e-01"),
        Case("-0.1", 3, "-1.000e-01"),
        Case("-0.001", 3, "-1.000e-03"),
        Case("-0.001", 1, "-1.0e-03"),
        Case("-0.001", 0, "-1e-03"),

        /* Values already in exponential notation. */
        Case("0e+00", 0, "0e+00"),
        Case("0.00e+00", 0, "0e+00"),
        Case("1e+06", 1, "1.0e+06"),
        Case("1e+06", 2, "1.00e+06"),
        Case("1.0001e+06", 1, "1.0e+06"),
        Case("1.0001e+06", 5, "1.00010e+06"),
    ];

    foreach (c; cases)
    {
        assert(c.raw.formatExponentValue(c.precision) == c.expected);
    }
}
1129 
/** significantDigits - Returns the number of significant digits in a numeric string.
 *
 * Significant digits are those needed to represent a number in exponential notation.
 * Counting starts at the first non-zero digit of the mantissa. Trailing zeros after the
 * decimal point are not counted; the exponent is never counted. Zero has one
 * significant digit. Examples:
 *   22.345   - 5 digits
 *   10.010   - 4 digits
 *   0.0032   - 2 digits
 *   0.10     - 1 digit
 *   0e+10    - 1 digit
 *
 * Params:
 *   numericString = A string for which std.string.isNumeric is true (asserted).
 *
 * Returns: The number of significant digits, or zero if the value is NaN or infinity.
 */
size_t significantDigits(const char[] numericString) @safe pure
{
    import std.algorithm : canFind, find, findAmong, stripRight;
    import std.ascii : isDigit;
    import std.conv : to;
    import std.math : isFinite;
    import std.string : isNumeric;

    assert (numericString.isNumeric);

    size_t significantDigits = 0;
    if (numericString.to!double.isFinite)
    {
        /* Remove the exponent before looking for digits. Searching first would pick up
         * exponent digits when the mantissa is all zeros (e.g. "0e+10").
         */
        auto exponentPart = numericString.findAmong("eE");
        auto mantissa = numericString[0 .. $ - exponentPart.length];

        /* Whether trailing zeros are fractional depends on the whole mantissa, not on
         * the part remaining after leading zeros are skipped (e.g. "0.10").
         */
        immutable bool hasDecimalPoint = mantissa.canFind('.');

        auto digitsPart = mantissa.find!(x => x.isDigit && x != '0');

        if (hasDecimalPoint)
        {
            digitsPart = digitsPart.stripRight('0');

            /* Don't count the decimal point itself if it is still in the slice. */
            significantDigits = digitsPart.canFind('.') ?
                digitsPart.length - 1 :
                digitsPart.length;
        }
        else
        {
            significantDigits = digitsPart.length;
        }

        if (significantDigits == 0) significantDigits = 1;
    }

    return significantDigits;
}
1169 
@safe pure unittest
{
    /* Table of (numeric string, expected significant digit count). */
    static struct Case
    {
        string numeric;
        size_t expected;
    }

    const Case[] cases = [
        Case("0", 1),
        Case("10", 2),
        Case("0.0", 1),
        Case("-10.0", 2),
        Case("-.01", 1),
        Case("-.5401", 4),
        Case("1010.010", 6),
        Case("0.0003003", 4),
        Case("6e+06", 1),
        Case("6.0e+06", 1),
        Case("6.5e+06", 2),
        Case("6.005e+06", 4),
    ];

    foreach (c; cases)
    {
        assert(c.numeric.significantDigits == c.expected);
    }
}
1185 
/* precisionDigits - Counts the digits that follow the decimal point in a numeric
 * string. Trailing zeros are included in the count; an exponent part, if present, is
 * not. Zero is returned when there is no decimal point or when the value is NaN or
 * infinity.
 *
 * The argument must satisfy std.string.isNumeric (asserted).
 */
size_t precisionDigits(const char[] numericString) @safe pure
{
    import std.algorithm : findAmong, findSplit;
    import std.conv : to;
    import std.math : isFinite;
    import std.string : isNumeric;

    assert (numericString.isNumeric);

    if (!numericString.to!double.isFinite) return 0;

    auto aroundPoint = numericString.findSplit(".");
    if (!aroundPoint) return 0;   // No decimal point.

    /* Everything after the point, minus the exponent (if any). */
    auto afterPoint = aroundPoint[2];
    return afterPoint.length - afterPoint.findAmong("eE").length;
}
1210 
@safe pure unittest
{
    /* Table of (numeric string, expected count of digits after the decimal point). */
    static struct Case
    {
        string numeric;
        size_t expected;
    }

    const Case[] cases = [
        Case("0", 0),
        Case("10", 0),
        Case("0.0", 1),
        Case("-10.0", 1),
        Case("-.01", 2),
        Case("-.5401", 4),
    ];

    foreach (c; cases)
    {
        assert(c.numeric.precisionDigits == c.expected);
    }
}
1220 
/** monospacePrintWidth - Calculates the expected print width of a string in monospace
 *  (fixed-width) fonts.
 *
 * The string is walked grapheme by grapheme. A grapheme whose first code point falls in
 * one of the double-width ranges listed in isWide counts as two positions; every other
 * grapheme counts as one.
 *
 * If the string cannot be decoded (e.g. an invalid UTF-8 sequence) the length of the
 * string in code units is returned instead, so that bad input does not abort the program.
 *
 * Params:
 *   str = Text to measure.
 *
 * Returns: The estimated number of print positions the text occupies.
 */
size_t monospacePrintWidth(const char[] str) @safe nothrow
{
    import std.uni : byGrapheme;

    /* True for code points normally rendered two cells wide. This is a range-based
     * approximation of the Unicode East Asian Width "Wide" and "Fullwidth" classes.
     */
    static bool isWide(dchar c) @safe pure nothrow @nogc
    {
        return (c >= 0x1100 && c <= 0x115F)      // Hangul Jamo leading consonants
            || (c >= 0x3000 && c <= 0x9FFF)      // CJK punctuation, kana, unified ideographs
            || (c >= 0xAC00 && c <= 0xD7A3)      // Hangul syllables
            || (c >= 0xF900 && c <= 0xFAFF)      // CJK compatibility ideographs
            || (c >= 0xFF01 && c <= 0xFF60)      // Fullwidth ASCII variants
            || (c >= 0xFFE0 && c <= 0xFFE6)      // Fullwidth signs
            || (c >= 0x20000 && c <= 0x3FFFD);   // Supplementary ideographic planes
    }

    size_t width = 0;
    try foreach (g; str.byGrapheme) width += isWide(g[0]) ? 2 : 1;
    catch (Exception) width = str.length;  // Invalid utf-8 sequence. Catch avoids program failure.

    return width;
}
1239 
unittest
{
    /* Table of (text, expected print width) for monospacePrintWidth. */
    static struct Case
    {
        string text;
        size_t expected;
    }

    const Case[] cases = [
        Case("", 0),
        Case(" ", 1),
        Case("abc", 3),
        Case("林檎", 4),
        Case("æble", 4),
        Case("ვაშლი", 5),
        Case("größten", 7),
    ];

    foreach (c; cases)
    {
        assert(c.text.monospacePrintWidth == c.expected);
    }
}