/**
Command line tool that appends multiple TSV files. It is header aware and supports
tracking the original source file of each row.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_append;

import std.conv : to;
import std.range;
import std.stdio;
import std.typecons : tuple;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /* Program entry point.
     *
     * Parses the command line into a TsvAppendOptions instance and, when processing
     * should continue, runs tsvAppend with standard output as the destination.
     *
     * Returns the exit code chosen by processArgs when it asks to stop (help, version,
     * or an argument error), 1 if tsvAppend throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        import tsvutil : BufferedOutputRange;
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvAppendOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        try tsvAppend(cmdopt, BufferedOutputRange!(typeof(stdout))(stdout));
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Help text printed for --help-verbose. The option list is appended by getopt. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, similar to the Unix 'cat' utility.
Unlike 'cat', it is header aware ('--H|header'), writing the header from only
the first file. It also supports source tracking, adding a column indicating
the original file to each row. Results are written to standard output.

Concatenation with header support is useful when preparing data for traditional
Unix utilities like 'sort' and 'sed' or applications that read a single file.

Source tracking is useful when creating long/narrow form tabular data, a format
used by many statistics and data mining packages. In this scenario, files have
been used to capture related data sets, the difference between data sets being a
condition represented by the file. For example, results from different variants
of an experiment might each be recorded in their own files. Retaining the source
file as an output column preserves the condition represented by the file.

The file-name (without extension) is used as the source value. This can be
customized using the --f|file option.

Example: Header processing:

   $ tsv-append -H file1.tsv file2.tsv file3.tsv

Example: Header processing and source tracking:

   $ tsv-append -H -t file1.tsv file2.tsv file3.tsv

Example: Source tracking with custom values:

   $ tsv-append -H -s test_id -f test1=file1.tsv -f test2=file2.tsv

Options:
EOS";

/* Help text printed for --help. The option list is appended by getopt. */
auto helpText = q"EOS
Synopsis: tsv-append [options] [file...]

tsv-append concatenates multiple TSV files, reading from files or standard input
and writing to standard output. It is header aware ('--H|header'), writing the
header from only the first file. It also supports source tracking, adding an
indicator of original file to each row of input.

Options:
EOS";

/* Command line options for tsv-append, plus the values derived from them.
 *
 * An instance is filled in by processArgs and then passed to tsvAppend.
 */
struct TsvAppendOptions
{
    string programName;
    string[] files;                    // Input files
    string[string] fileSourceNames;    // Maps file path to the 'source' value
    bool helpVerbose = false;          // --help-verbose
    string sourceHeader;               // --s|source-header
    bool trackSource = false;          // --t|track-source
    bool hasHeader = false;            // --H|header
    char delim = '\t';                 // --d|delimiter
    bool versionWanted = false;        // --V|version

    /* fileOptionHandler processes the '--f|file source=file' option.
     *
     * The option value is split at the first '='. The part before it is the source
     * value, the part after it is the file path. The path is appended to 'files' and
     * the source value is recorded in 'fileSourceNames' under that path.
     *
     * Throws an Exception if either part is empty, which includes the case where the
     * value contains no '=' at all.
     */
    private void fileOptionHandler(string option, string optionVal)
    {
        import std.algorithm : findSplit;
        import std.format : format;

        auto valSplit = findSplit(optionVal, "=");
        if (valSplit[0].empty || valSplit[2].empty)
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <source>=<file>'.",
                       option, optionVal, option));

        auto source = valSplit[0];
        auto filepath = valSplit[2];
        files ~= filepath;
        fileSourceNames[filepath] = source;
    }

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated:
     *  - '--f|file' and '--s|source-header' each turn on source tracking.
     *  - '--s|source-header' turns on header processing.
     *  - With header processing on and no source header given, the source header is "file".
     *  - Remaining (non-option) arguments are appended to 'files', after any '--f|file'
     *    entries, each with its file name minus directory and extension as source value.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "          Print full help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header",        "          Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "t|track-source",  "          Track the source file. Adds a column with the source name.", &trackSource,
                "s|source-header", "STR       Use STR as the header for the source column. Implies --H|header and --t|track-source. Default: 'file'", &sourceHeader,
                "f|file",          "STR=FILE  Read file FILE, using STR as the 'source' value. Implies --t|track-source.", &fileOptionHandler,
                "d|delimiter",     "CHR       Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
                std.getopt.config.caseSensitive,
                "V|version",       "          Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-append"));
                return tuple(false, 0);
            }

            /* Derivations and consistency checks. At this point 'files' holds only the
             * entries added by the --f|file option, so a non-empty list means --f was used.
             */
            if (files.length > 0 || !sourceHeader.empty) trackSource = true;
            if (!sourceHeader.empty) hasHeader = true;
            if (hasHeader && sourceHeader.empty) sourceHeader = "file";

            /* Assume the remaining arguments are filepaths. */
            foreach (fp; cmdArgs[1 .. $])
            {
                files ~= fp;
                fileSourceNames[fp] = fp.stripExtension.baseName;
            }

            /* Add a name mapping for dash ('-') unless it was included in the --file option.
             * NOTE(review): a '-' given as a plain argument is assigned a source name by the
             * loop above, so this default presumably only takes effect when '-' was not
             * listed at all - confirm the intended source name for an explicit '-'.
             */
            if ("-" !in fileSourceNames) fileSourceNames["-"] = "stdin";
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* tsvAppend concatenates the input files and writes the result to outputStream.
 *
 * Files are read in the order listed in cmdopt.files. If the list is empty, a single
 * input named "-" is used. The name "-" selects standard input.
 *
 * When cmdopt.hasHeader is set, the first line of every file is treated as a header.
 * Only the first header encountered is written, the rest are dropped. When
 * cmdopt.trackSource is set, every output line is prefixed with a source value and
 * the delimiter: cmdopt.sourceHeader on the header line, the file's entry in
 * cmdopt.fileSourceNames on all other lines.
 *
 * Every file name processed must have an entry in cmdopt.fileSourceNames. processArgs
 * sets this up.
 */
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    bool headerWritten = false;
    foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        auto sourceName = cmdopt.fileSourceNames[filename];
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (cmdopt.hasHeader && fileLineNum == 1)
            {
                /* Header line. Written once, for the first file that has one. */
                if (!headerWritten)
                {
                    if (cmdopt.trackSource)
                    {
                        outputStream.put(cmdopt.sourceHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put('\n');
                    headerWritten = true;
                }
            }
            else
            {
                /* Data line. */
                if (cmdopt.trackSource)
                {
                    outputStream.put(sourceName);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put('\n');
            }
        }
    }
}

version(unittest)
{
    /* Unit test helper functions. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.

    /* Runs tsvAppend for the given command line and asserts that the output matches
     * 'expected'. cmdArgs[0] is the test name, used as the program name and in
     * assertion messages. 'expected' holds one array of field values per output line.
     */
    void testTsvAppend(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvAppend] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvAppend] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvAppendOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // Captured first; processArgs takes cmdArgs by ref.
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        auto output = appender!(char[])();
        tsvAppend(cmdopt, output);
        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
 }

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_append");
    scope(exit) testDir.rmdirRecurse;

    string[][] data1 =
        [["field_a", "field_b", "field_c"],
         ["red", "17", "κόκκινος"],
         ["blue", "12", "άσπρο"]];

    string[][] data2 =
        [["field_a", "field_b", "field_c"],
         ["green", "13.5", "κόκκινος"],
         ["blue", "15", "πράσινος"]];

    string[][] data3 =
        [["field_a", "field_b", "field_c"],
         ["yellow", "9", "κίτρινος"]];

    string[][] dataHeaderRowOnly =
        [["field_a", "field_b", "field_c"]];

    string[][] dataEmpty = [[]];

    string filepath1 = buildPath(testDir, "file1.tsv");
    string filepath2 = buildPath(testDir, "file2.tsv");
    string filepath3 = buildPath(testDir, "file3.tsv");
    string filepathHeaderRowOnly = buildPath(testDir, "fileHeaderRowOnly.tsv");
    string filepathEmpty = buildPath(testDir, "fileEmpty.tsv");

    writeUnittestTsvFile(filepath1, data1);
    writeUnittestTsvFile(filepath2, data2);
    writeUnittestTsvFile(filepath3, data3);
    writeUnittestTsvFile(filepathHeaderRowOnly, dataHeaderRowOnly);
    writeUnittestTsvFile(filepathEmpty, dataEmpty);

    /* Plain concatenation, with and without header processing. */
    testTsvAppend(["test-1", filepath1], data1);
    testTsvAppend(["test-2", "--header", filepath1], data1);
    testTsvAppend(["test-3", filepath1, filepath2], data1 ~ data2);

    testTsvAppend(["test-4", "--header", filepath1, filepath2],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"]]);

    testTsvAppend(["test-5", "--header", filepath1, filepath2, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Empty and header-only files mixed in. */
    testTsvAppend(["test-6", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  data1 ~ dataEmpty ~ data2 ~ dataHeaderRowOnly ~ data3);

    testTsvAppend(["test-7", "--header", filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["field_a", "field_b", "field_c"],
                   ["red", "17", "κόκκινος"],
                   ["blue", "12", "άσπρο"],
                   ["green", "13.5", "κόκκινος"],
                   ["blue", "15", "πράσινος"],
                   ["yellow", "9", "κίτρινος"]]);

    /* Source tracking. Without --header, header lines are treated as data. */
    testTsvAppend(["test-8", "--track-source", filepath1, filepath2],
                  [["file1", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "field_a", "field_b", "field_c"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-9", "--header", "--track-source", filepath1, filepath2],
                  [["file", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"]]);

    testTsvAppend(["test-10", "-H", "-t", "--source-header", "source",
                   filepath1, filepathEmpty, filepath2, filepathHeaderRowOnly, filepath3],
                  [["source", "field_a", "field_b", "field_c"],
                   ["file1", "red", "17", "κόκκινος"],
                   ["file1", "blue", "12", "άσπρο"],
                   ["file2", "green", "13.5", "κόκκινος"],
                   ["file2", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);

    /* Custom source values via --file. */
    testTsvAppend(["test-11", "-H", "-t", "-s", "id", "--file", format("1a=%s", filepath1),
                   "--file", format("1b=%s", filepath2), "--file", format("1c=%s", filepath3)],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["1c", "yellow", "9", "κίτρινος"]]);

    /* --source-header and --file imply --header and --track-source. */
    testTsvAppend(["test-12", "-s", "id", "-f", format("1a=%s", filepath1),
                   "-f", format("1b=%s", filepath2), filepath3],
                  [["id", "field_a", "field_b", "field_c"],
                   ["1a", "red", "17", "κόκκινος"],
                   ["1a", "blue", "12", "άσπρο"],
                   ["1b", "green", "13.5", "κόκκινος"],
                   ["1b", "blue", "15", "πράσινος"],
                   ["file3", "yellow", "9", "κίτρινος"]]);
}