/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_append;

import std.conv : to;
import std.exception : enforce;
import std.range;
import std.stdio;
import std.typecons : tuple;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-append to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns: The exit code supplied by processArgs when it indicates execution
     * should not continue; otherwise 0 on success, or 1 if tsvAppend throws an
     * Exception (the message is written to stderr).
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange;
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

$ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

$ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

$ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/** Container for command line options.
 *
 * NOTE(review): fileSourceNames is keyed by file path, so a path that is listed
 * more than once (via --f|file and/or as a plain argument) keeps only the source
 * name assigned last. Confirm whether that is the intended behavior.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    /// Input files
    string[string] fileSourceNames;    /// Maps file path to the 'source' value
    string sourceHeader;               /// --s|source-header
    bool trackSource = false;          /// --t|track-source
    bool hasHeader = false;            /// --H|header
    char delim = '\t';                 /// --d|delimiter

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. Both the source name (before)
     * and the file path (after) must be non-empty, otherwise an exception is
     * thrown. The file path is appended to 'files' and mapped to the source name.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");

        enforce(!valSplit[0].empty && !valSplit[2].empty,
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated: the options implied by --s|source-header and --f|file have
     * been switched on, the default source header has been filled in when a header is
     * in use, and every input file (plus "-" for standard input) has an entry in
     * fileSourceNames.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        bool helpVerbose = false;          // --help-verbose
        bool versionWanted = false;        // --V|version

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source", " Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file", "STR=FILE Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only
             * the files given via --f|file; plain file arguments are added below.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. The source name is the
             * file name without directory or extension.
             */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Each file in cmdopt.files is read line by line and written to outputStream. If
 * no files are listed, standard input ("-") is read. When cmdopt.hasHeader is set,
 * the first line of every file is treated as a header and only the first header
 * encountered is written. When cmdopt.trackSource is set, each data line is
 * prefixed with the file's source name and the header line with
 * cmdopt.sourceHeader, followed by cmdopt.delim.
 *
 * Params:
 *   cmdopt       = Options, normally populated by TsvAppendOptions.processArgs.
 *                  fileSourceNames must contain an entry for every file read.
 *   outputStream = Output range of char receiving the concatenated lines.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange;

    bool headerWritten = false;
    foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        /* Files don't always close quickly enough on their own, so close them
         * explicitly. Using a scope guard means the file is also closed when an
         * exception ends processing part way through. Standard input is left open.
         */
        scope(exit)
        {
            if (filename != "-") inputStream.close;
        }

        auto sourceName = cmdopt.fileSourceNames[filename];
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;

                    /* Flush the header immediately. This helps tasks further on in a
                     * unix pipeline detect errors quickly, without waiting for all
                     * the data to flow through the pipeline. Note that an upstream
                     * task may have flushed its header line, so the header may
                     * arrive long before the main block of data.
                     */
                    static if (isFlushableOutputRange!OutputRange) outputStream.flush;
                }
            }
            else
            {
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs processArgs and tsvAppend on cmdArgs, capturing the output in memory,
     * and asserts that it equals the 'expected' rows rendered as TSV text.
     * cmdArgs[0] serves as the test name in assertion messages.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        /* processArgs takes cmdArgs by ref, so keep a printable copy for messages. */
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
 }

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}