1 /**
2 Command line tool that appends multiple TSV files. It is header aware and supports
3 tracking the original source file of each row.
4 
5 Copyright (c) 2017-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_append;
11 
12 import std.conv : to;
13 import std.range;
14 import std.stdio;
15 import std.typecons : tuple;
16 
17 version(unittest)
18 {
19     // When running unit tests, use main from -main compiler switch.
20 }
21 else
22 {
23     int main(string[] cmdArgs)
24     {
25         import tsvutil : BufferedOutputRange;
26         /* When running in DMD code coverage mode, turn on report merging. */
27         version(D_Coverage) version(DigitalMars)
28         {
29             import core.runtime : dmd_coverSetMerge;
30             dmd_coverSetMerge(true);
31         }
32 
33         TsvAppendOptions cmdopt;
34         auto r = cmdopt.processArgs(cmdArgs);
35         if (!r[0]) return r[1];
36         try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
37         catch (Exception exc)
38         {
39             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
40             return 1;
41         }
42         return 0;
43     }
44 }
45 
/* Long-form help text, printed ahead of the option list when --help-verbose is given. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";
81 
/* Short help text, printed ahead of the option list when help is requested. */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of the original file to each row of input.

Options:
EOS";
92 
/** Command line options for tsv-append.
 *
 * The fields hold both the raw option values and the values derived from them by
 * processArgs (for example, the list of input files and the source name of each file).
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The text before it is the source value,
     * the text after it is the file path. The file is appended to 'files' and its source
     * value is recorded in 'fileSourceNames'.
     *
     * Throws: Exception if either the source value or the file path is empty, which
     * includes the case where the option value contains no '='.
     */
    private void fileOptionHandler(string option, string optionVal)
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help or version information. If false, the second value is the appropriate
     * exit code (0 or 1).
     *
     * Returning true (execution continues) means derived values have been calculated:
     *  - trackSource is turned on if --file or --source-header was used.
     *  - hasHeader is turned on if --source-header was used.
     *  - sourceHeader defaults to "file" when headers are in use.
     *  - Remaining command line arguments are appended to 'files', after any files given
     *    via --file, using the file name without directory or extension as source value.
     *  - A "stdin" source value is registered for "-" if no entry for "-" exists.
     *
     * Errors are reported on stderr; they are not propagated to the caller.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * files given via --file, as positional arguments have not been added yet.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless one has already been registered. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
198 
/** Writes the lines of all input files to outputStream, one file after another.
 *
 * Input files are taken from cmdopt.files; if that list is empty, standard input ("-")
 * is read instead. When cmdopt.hasHeader is set, the first line of every file is treated
 * as a header and only the first header encountered is written. When cmdopt.trackSource
 * is set, every output line is prefixed with a source column: cmdopt.sourceHeader on the
 * header line, the file's entry in cmdopt.fileSourceNames on all other lines.
 *
 * Every line is written with a '\n' terminator.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    auto inputFiles = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];
    bool headerEmitted = false;

    foreach (filepath; inputFiles)
    {
        auto input = (filepath == "-") ? stdin : filepath.File();
        auto sourceValue = cmdopt.fileSourceNames[filepath];

        foreach (lineNum, text; input.byLine(KeepTerminator.no).enumerate(1))
        {
            immutable isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            /* Headers after the first one written are dropped. */
            if (isHeaderLine && headerEmitted) continue;

            if (cmdopt.trackSource)
            {
                /* The source column carries its own header text on the header line. */
                outputStream.put(isHeaderLine ? cmdopt.sourceHeader : sourceValue);
                outputStream.put(cmdopt.delim);
            }
            outputStream.put(text);
            outputStream.put('\n');

            if (isHeaderLine) headerEmitted = true;
        }
    }
}
236 
237 version(unittest)
238 {
239     /* Unit test helper functions. */
240 
241     import unittest_utils;   // tsv unit test helpers, from common/src/.
242 
243     void testTsvAppend(string[] cmdArgs, string[][] expected)
244     {
245         import std.array : appender;
246         import std.format : format;
247 
248         assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");
249 
250         auto formatAssertMessage(T...)(string msg, T formatArgs)
251         {
252             auto formatString = "[testTsvAppend] %s: " ~ msg;
253             return format(formatString, cmdArgs[0], formatArgs);
254         }
255 
256         TsvAppendOptions cmdopt;
257         auto savedCmdArgs = cmdArgs.to!string;
258         auto r = cmdopt.processArgs(cmdArgs);
259         assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
260 
261         auto output = appender!(char[])();
262         tsvAppend(cmdopt, output);
263         auto expectedOutput = expected.tsvDataToString;
264 
265         assert(output.data == expectedOutput,
266                formatAssertMessage(
267                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
268                    expectedOutput.to!string, output.data.to!string));
269     }
270  }
271 
/* End-to-end tests of tsvAppend. A set of small TSV files is written to a temporary
 * directory and appended in various combinations of header and source tracking options.
 */
unittest
{
    import std.file : rmdirRecurse;
    import std.format : format;
    import std.path : buildPath;

    auto scratchDir = makeUnittestTempDir("tsv_append");
    scope(exit) scratchDir.rmdirRecurse;

    /* Fixture tables. The first row of each of the three regular tables is the header. */
    string[][] rows1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] rows2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] rows3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] rowsHeaderOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] rowsEmpty = [[]];

    /* Fixture files, one per table. */
    string path1 = buildPath(scratchDir, "file1.tsv");
    string path2 = buildPath(scratchDir, "file2.tsv");
    string path3 = buildPath(scratchDir, "file3.tsv");
    string pathHeaderOnly = buildPath(scratchDir, "fileHeaderRowOnly.tsv");
    string pathEmpty = buildPath(scratchDir, "fileEmpty.tsv");

    writeUnittestTsvFile(path1, rows1);
    writeUnittestTsvFile(path2, rows2);
    writeUnittestTsvFile(path3, rows3);
    writeUnittestTsvFile(pathHeaderOnly, rowsHeaderOnly);
    writeUnittestTsvFile(pathEmpty, rowsEmpty);

    /* Single file, and plain concatenation without header handling. */
    testTsvAppend(["test-1", path1], rows1);
    testTsvAppend(["test-2", "--header", path1], rows1);
    testTsvAppend(["test-3", path1, path2], rows1 ~ rows2);

    /* Header handling: the header of the first file is kept, later headers are dropped. */
    testTsvAppend(["test-4", "--header", path1, path2],
                  rows1 ~ rows2[1 .. $]);

    testTsvAppend(["test-5", "--header", path1, path2, path3],
                  rows1 ~ rows2[1 .. $] ~ rows3[1 .. $]);

    /* Empty and header-only files mixed in with regular files. */
    testTsvAppend(["test-6", path1, pathEmpty, path2, pathHeaderOnly, path3],
                  rows1 ~ rowsEmpty ~ rows2 ~ rowsHeaderOnly ~ rows3);

    testTsvAppend(["test-7", "--header", path1, pathEmpty, path2, pathHeaderOnly, path3],
                  rows1 ~ rows2[1 .. $] ~ rows3[1 .. $]);

    /* Source tracking without header handling: header lines are treated as data. */
    testTsvAppend(["test-8", "--track-source", path1, path2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Source tracking with header handling, using the default source column header. */
    testTsvAppend(["test-9", "--header", "--track-source", path1, path2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    /* Custom source column header. */
    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   path1, pathEmpty, path2, pathHeaderOnly, path3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via --file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", path1),
                   "--file", format("1b=%s", path2), "--file", format("1c=%s", path3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    /* --source-header and --file without explicit -H/-t, plus a positional file. */
    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", path1),
                   "-f", format("1b=%s", path2), path3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}