1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_append;
11 
12 import std.conv : to;
13 import std.exception : enforce;
14 import std.range;
15 import std.stdio;
16 import std.typecons : tuple;
17 
/* Hand the "gcopt=cleanup:none" option to the D runtime through the rt_options
 * hook. The declaration is only compiled in for compiler front-end versions
 * 2.085 and later.
 * NOTE(review): presumably this skips the garbage collector's cleanup pass at
 * program termination - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
19 
20 version(unittest)
21 {
22     // When running unit tests, use main from -main compiler switch.
23 }
24 else
25 {
26     /** Main program. Invokes command line arg processing and tsv-append to perform
27      * the real work. Any errors are caught and reported.
28      */
29     int main(string[] cmdArgs)
30     {
31         import tsv_utils.common.utils : BufferedOutputRange;
32         /* When running in DMD code coverage mode, turn on report merging. */
33         version(D_Coverage) version(DigitalMars)
34         {
35             import core.runtime : dmd_coverSetMerge;
36             dmd_coverSetMerge(true);
37         }
38 
39         TsvAppendOptions cmdopt;
40         auto r = cmdopt.processArgs(cmdArgs);
41         if (!r[0]) return r[1];
42         try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
43         catch (Exception exc)
44         {
45             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
46             return 1;
47         }
48         return 0;
49     }
50 }
51 
/** Long-form help text, printed ahead of the option list when '--help-verbose'
 * is given.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
87 
/** Short help text, printed ahead of the option list when getopt reports that
 * help was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";
98 
/** Container for command line options.
 *
 * Holds the values parsed from the command line together with the derived list
 * of input files and the 'source' label associated with each file path.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The text before it is the source
     * label, the text after it is the file path. Both parts must be non-empty,
     * otherwise an exception describing the expected format is thrown. The path is
     * appended to 'files' and its label recorded in 'fileSourceNames'.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means derived values have been calculated:
     * $(UL
     *   $(LI Source tracking is turned on if '--file' or '--source-header' was used.)
     *   $(LI Header processing is turned on if '--source-header' was used.)
     *   $(LI The source column header defaults to "file" when headers are enabled.)
     *   $(LI Arguments remaining after option processing are appended to 'files',
     *        after any '--file' entries, labeled with the file's base name without
     *        extension.)
     *   $(LI The path "-" is labeled "stdin" unless it already has a label.)
     * )
     *
     * Labels are keyed by file path, so a path listed more than once ends up with
     * the last label assigned to it.
     *
     * Exceptions raised while processing are reported on stderr and turned into a
     * (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only
             * the entries added by the --file option, so a non-empty list means
             * --file was used.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless one was already recorded above. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
208 
/** tsvAppend writes the lines of every input file, in order, to outputStream.
 *
 * Input files come from cmdopt.files. If that list is empty, standard input ("-")
 * is read instead. When cmdopt.hasHeader is set, the first line of each file is
 * treated as a header and only the first header encountered is written. When
 * cmdopt.trackSource is set, every written line is prefixed with a source value
 * and the delimiter: cmdopt.sourceHeader for the header line, the file's entry in
 * cmdopt.fileSourceNames for all other lines.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine;

    auto inputFiles = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];
    bool headerDone = false;

    foreach (filename; inputFiles)
    {
        /* Open the input before looking up its label, so a failure to open is reported first. */
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceLabel = cmdopt.fileSourceNames[filename];

        size_t lineNum = 0;
        foreach (line; inputStream.bufferedByLine!(KeepTerminator.no))
        {
            ++lineNum;
            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            /* Headers after the first one are dropped. */
            if (isHeaderLine && headerDone) continue;

            if (cmdopt.trackSource)
            {
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceLabel);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(line);
            outputStream.put('\n');

            if (isHeaderLine) headerDone = true;
        }
    }
}
250 
251 version(unittest)
252 {
253     /* Unit test helper functions. */
254 
255     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
256 
257     void testTsvAppend(string[] cmdArgs, string[][] expected)
258     {
259         import std.array : appender;
260         import std.format : format;
261 
262         assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");
263 
264         auto formatAssertMessage(T...)(string msg, T formatArgs)
265         {
266             auto formatString = "[testTsvAppend] %s: " ~ msg;
267             return format(formatString, cmdArgs[0], formatArgs);
268         }
269 
270         TsvAppendOptions cmdopt;
271         auto savedCmdArgs = cmdArgs.to!string;
272         auto r = cmdopt.processArgs(cmdArgs);
273         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
274 
275         auto output = appender!(char[])();
276         tsvAppend(cmdopt, output);
277         auto expectedOutput = expected.tsvDataToString;
278 
279         assert(output.data == expectedOutput,
280                formatAssertMessage(
281                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
282                    expectedOutput.to!string, output.data.to!string));
283     }
284  }
285 
/* End-to-end tests of tsvAppend driven through testTsvAppend. Fixture files are
 * written to a temporary directory that is removed when the test block exits.
 */
unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    /* Fixture data: three files sharing the same header row. */
    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    /* Edge-case fixtures: a file with only a header row and a file with a single empty row. */
    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Plain concatenation, with and without header processing. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Empty-row and header-only files mixed in: without --header every line is copied through. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    /* With --header the first line of each file is consumed as its header, so the
     * empty-row and header-only files contribute no output rows.
     */
    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Source tracking without --header: header rows are treated as data and get the file label. */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Source tracking with --header: the source column header defaults to "file". */
    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Custom source column header via --source-header. */
    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via --file STR=FILE. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    /* -s alone implies header processing and source tracking; a positional file
     * following the -f entries is labeled with its base name.
     */
    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}