/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_append;

import std.conv : to;
import std.range;
import std.stdio;
import std.typecons : tuple;

/* D runtime GC option, set only for compiler front-end versions 2.085 and later. */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program. Invokes command line arg processing and tsv-append to perform
     * the real work. Any errors are caught and reported.
     *
     * Returns: 0 on success (including help/version requests), 1 on error.
     */
    int main(string[] cmdArgs)
    {
        import tsv_utils.common.utils : BufferedOutputRange;
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);

        /* r[0] is false on both errors and help/version requests; r[1] is the exit code. */
        if (!r[0]) return r[1];

        try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Help text printed by '--help-verbose'. The option list is appended by getopt. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

$ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

$ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

$ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

/* Help text printed by '--help'. The option list is appended by getopt. */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/** Container for command line options.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* 'source' value for each entry in 'files', kept in the same order. A path-keyed
     * map alone cannot represent the same path listed more than once with different
     * source values, so the per-entry value is recorded here. Declared last so the
     * positions of the pre-existing fields are unchanged.
     */
    string[] sourceNames;

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. Both the source name and the file
     * path must be non-empty, otherwise an Exception is thrown.
     */
    private void fileOptionHandler(string option, string optionVal) pure @safe
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        sourceNames ~= source;
        fileSourceNames[filepath] = source;
    }

    /** Command line argument processing.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated:
     *   - '--f|file' or '--s|source-header' turn on source tracking.
     *   - '--s|source-header' turns on header processing.
     *   - The source column header defaults to 'file' when header processing is on.
     *   - Remaining (positional) arguments are appended to the file list, using the
     *     file name without directory or extension as the source value.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source", " Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file", "STR=FILE Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only
             * entries from the --f|file option; positional files are added below.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                auto source = fp.stripExtension.baseName;
                files ~= fp;
                sourceNames ~= source;
                fileSourceNames[fp] = source;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option. */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvAppend implements the basic functionality of the tsv-append program.
 *
 * Reads each input file in order (standard input if no files were given) and writes
 * every line to outputStream. When header processing is on, the first line of each
 * file is treated as a header and only the first header seen is written. When source
 * tracking is on, each output line is prefixed with the source value (or the source
 * header, for the header line) followed by the delimiter.
 *
 * Params:
 *   cmdopt       = Options, normally populated by TsvAppendOptions.processArgs.
 *   outputStream = Output range of char receiving the result.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import tsv_utils.common.utils : bufferedByLine;

    bool headerWritten = false;
    auto filepaths = (cmdopt.files.length > 0) ? cmdopt.files : ["-"];

    foreach (fileIndex, filename; filepaths)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        /* Prefer the per-entry source name so a path listed more than once keeps the
         * source value given for each occurrence. Fall back to the path-keyed map when
         * there is no per-entry value (e.g. the default standard input case).
         */
        auto sourceName = (fileIndex < cmdopt.sourceNames.length)
            ? cmdopt.sourceNames[fileIndex]
            : cmdopt.fileSourceNames[filename];

        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                /* Header line. Written once, from the first file that has one. */
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;
                }
            }
            else
            {
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs processArgs and tsvAppend on cmdArgs and asserts the output matches expected.
     * cmdArgs[0] is the test name; it is used in assertion messages.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // processArgs modifies cmdArgs
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* The same file listed twice with different source values keeps each value. */
    testTsvAppend(["test-13", "-H", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath1)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "red", "17", "κόκκινος"],
                   ["1b", "blue", "12", "άσπρο"]]);

    /* A positional file does not override the source value given with --file for the same path. */
    testTsvAppend(["test-14", "-t", "-f", format("1a=%s", filepath1), filepath1],
                  [["1a", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"]]);
}