1 /** 2 Command line tool that identifies equivalent lines in an input stream. Equivalent 3 lines are identified using either the full line or a set of fields as the key. By 4 default, input is written to standard output, retaining only the first occurrence of 5 equivalent lines. There are also options for marking and numbering equivalent lines 6 rather, without filtering out duplicates. 7 8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences. 9 First, the key can be composed of individual fields, not just the full line. Second, 10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when 11 they are adjacent, hence the usual need for sorting.) 12 13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with 14 an equivalence ID, which is a one-upped counter. The other is to number lines, with 15 each unique key have its own set of numbers. 16 17 Copyright (c) 2015-2021, eBay Inc. 18 Initially written by Jon Degenhardt 19 20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 21 */ 22 module tsv_utils.tsv_uniq; 23 24 import std.exception : enforce; 25 import std.format : format; 26 import std.range; 27 import std.stdio; 28 import std.typecons : tuple; 29 30 auto helpText = q"EOS 31 Synopsis: tsv-uniq [options] [file...] 32 33 tsv-uniq filters out duplicate lines using fields as a key. Filtering is 34 based on the entire line when key fields are not provided. Options are 35 also available for assigning a unique id to each key and numbering the 36 occurrences of each key. Fields are specified using field number or field 37 name. Field names can be used when the input file has a header line. 38 39 Use '--help-verbose' for more details. 40 Options: 41 EOS"; 42 43 auto helpTextVerbose = q"EOS 44 Synopsis: tsv-uniq [options] [file...] 45 46 tsv-uniq identifies equivalent lines in tab-separated value files. Input 47 is read line by line, recording a key for each line based on one or more 48 of the fields. Two lines are equivalent if they have the same key. The 49 first time a key is seen its line is written to standard output. 50 Subsequent lines containing the same key are discarded. This command 51 uniq's a file on fields 2 and 3: 52 53 tsv-uniq -f 2,3 file.tsv 54 55 This is similar to the Unix 'uniq' program, but based on individual 56 fields and without requiring sorted data. 57 58 Field names can be used if the input file has a header line. This command 59 uniq's a file based on the 'time' and 'date' fields: 60 61 tsv-uniq -H -f time,date file.tsv 62 63 Use '--help-fields' for details about field names. 64 65 tsv-uniq can be run without specifying a key field. In this case the 66 whole line is used as a key, same as the Unix 'uniq' program. This works 67 on any line-oriented text file, not just TSV files. 68 69 The above is the default behavior ('uniq' mode). The alternates to 'uniq' 70 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all 71 lines are written to standard output, but with a field appended marking 72 equivalent entries with an ID. The ID is a one-upped counter. Example: 73 74 tsv-uniq --header -f 2,3 --equiv file.tsv 75 76 'Number' mode also writes all lines to standard output, but with a field 77 appended numbering the occurrence count for the line's key. The first line 78 with a specific key is assigned the number '1', the second with the key is 79 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined. 80 81 The '--r|repeated' option can be used to print only lines occurring more 82 than once. Specifically, the second occurrence of a key is printed. The 83 '--a|at-least N' option is similar, printing lines occurring at least N 84 times. (Like repeated, the Nth line with the key is printed.) 85 86 The '--m|max MAX' option changes the behavior to output the first MAX 87 lines for each key, rather than just the first line for each key. 88 89 If both '--a|at-least' and '--m|max' are specified, the occurrences 90 starting with 'at-least' and ending with 'max' are output. 91 92 Options: 93 EOS"; 94 95 /** Container for command line options. 96 */ 97 struct TsvUniqOptions 98 { 99 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 100 101 enum defaultEquivHeader = "equiv_id"; 102 enum defaultEquivStartID = 1; 103 enum defaultNumberHeader = "equiv_line"; 104 105 string programName; 106 InputSourceRange inputSources; /// Input files 107 size_t[] fields; /// Derived: --f|fields 108 bool hasHeader = false; /// --H|header 109 bool onlyRepeated = false; /// --r|repeated. Shorthand for '--atleast 2' 110 size_t atLeast = 0; /// --a|at-least. Zero implies default behavior. 111 size_t max = 0; /// --m|max. Zero implies default behavior. 112 bool numberMode = false; /// --z|number 113 string numberHeader = defaultNumberHeader; /// --number-header 114 bool equivMode = false; /// --e|equiv 115 string equivHeader = defaultEquivHeader; /// --equiv-header 116 long equivStartID = defaultEquivStartID; /// --equiv-start 117 bool ignoreCase = false; /// --i|ignore-case 118 char delim = '\t'; /// --d|delimiter 119 bool lineBuffered = false; /// --line-buffered 120 bool keyIsFullLine = false; /// Derived. True if no fields specified or '--f|fields 0' 121 122 /* Returns a tuple. First value is true if command line arguments were successfully 123 * processed and execution should continue, or false if an error occurred or the user 124 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 125 * 126 * Returning true (execution continues) means args have been validated and derived 127 * values calculated. In addition, field indices have been converted to zero-based. 128 * If the whole line is the key, the individual fields list will be cleared. 129 * 130 * Repeat count control variables 'atLeast' and max' - These values are left at zero 131 * if no repeat count options are specified. They are set if repeat count options 132 * are specified, as follows: 133 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 134 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 135 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 136 * 'atLeast' if --r|repeated or --a|at-least is provided. 137 * 138 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 139 * represents the default "output all values" case. In this case max may be less 140 * than the at-least value. 141 */ 142 auto processArgs (ref string[] cmdArgs) 143 { 144 import std.algorithm : all, each; 145 import std.conv : to; 146 import std.getopt; 147 import std.path : baseName, stripExtension; 148 import std.typecons : Yes, No; 149 import tsv_utils.common.fieldlist; 150 import tsv_utils.common.utils : throwIfWindowsNewline; 151 152 bool helpVerbose = false; // --h|help-verbose 153 bool helpFields = false; // --help-fields 154 bool versionWanted = false; // --V|version 155 string fieldsArg; // --f|fields 156 157 string fieldsOptionString = "f|fields"; 158 159 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 160 161 try 162 { 163 arraySep = ","; // Use comma to separate values in command line options 164 auto r = getopt( 165 cmdArgs, 166 "help-verbose", " Print full help.", &helpVerbose, 167 "help-fields", " Print help on specifying fields.", &helpFields, 168 std.getopt.config.caseSensitive, 169 "V|version", " Print version information and exit.", &versionWanted, 170 "H|header", " Treat the first line of each file as a header.", &hasHeader, 171 std.getopt.config.caseInsensitive, 172 173 fieldsOptionString, "<field-list> Fields to use as the key. Default: 0 (entire line).", &fieldsArg, 174 175 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 176 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 177 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 178 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 179 "e|equiv", " Output equivalence class IDs rather than uniq'ing entries.", &equivMode, 180 "equiv-header", "STR Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader, 181 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 182 "z|number", " Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode, 183 "number-header", "STR Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader, 184 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 185 "line-buffered", " Immediately output every line.", &lineBuffered, 186 ); 187 188 if (r.helpWanted) 189 { 190 defaultGetoptPrinter(helpText, r.options); 191 return tuple(false, 0); 192 } 193 else if (helpVerbose) 194 { 195 defaultGetoptPrinter(helpTextVerbose, r.options); 196 return tuple(false, 0); 197 } 198 else if (helpFields) 199 { 200 writeln(fieldListHelpText); 201 return tuple(false, 0); 202 } 203 else if (versionWanted) 204 { 205 import tsv_utils.common.tsvutils_version; 206 writeln(tsvutilsVersionNotice("tsv-uniq")); 207 return tuple(false, 0); 208 } 209 210 /* Input files. Remaining command line args are files. */ 211 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 212 cmdArgs.length = 1; 213 214 /* Validation - Do as much validation prior to header line processing as 215 * possible (avoids waiting on stdin). 216 */ 217 if (!equivMode) 218 { 219 enforce(equivHeader == defaultEquivHeader, "--equiv-header requires --e|equiv"); 220 enforce(equivStartID == defaultEquivStartID, "--equiv-start requires --e|equiv"); 221 } 222 223 enforce(numberMode || numberHeader == defaultNumberHeader, 224 "--number-header requires --z|number"); 225 226 string[] headerFields; 227 228 /* fieldListArgProcessing encapsulates the field list processing. It is 229 * called prior to reading the header line if headers are not being used, 230 * and after if headers are being used. 231 */ 232 void fieldListArgProcessing() 233 { 234 if (!fieldsArg.empty) 235 { 236 fields = 237 fieldsArg 238 .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero) 239 (hasHeader, headerFields, fieldsOptionString) 240 .array; 241 } 242 243 enforce(fields.length <= 1 || fields.all!(x => x != 0), 244 "Whole line as key (--f|field 0) cannot be combined with multiple fields."); 245 246 if (fields.length == 0) 247 { 248 keyIsFullLine = true; 249 } 250 else if (fields.length == 1 && fields[0] == 0) 251 { 252 keyIsFullLine = true; 253 fields.length = 0; 254 } 255 256 if (onlyRepeated && atLeast <= 1) atLeast = 2; 257 if (atLeast >= 2 && max < atLeast) 258 { 259 // Don't modify max if it is zero and equivMode or numberMode is in effect. 260 if (max != 0 || (!equivMode && !numberMode)) max = atLeast; 261 } 262 263 if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. 264 } 265 266 if (!hasHeader) fieldListArgProcessing(); 267 268 /* 269 * Create the inputSourceRange and perform header line processing. 270 */ 271 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 272 inputSources = inputSourceRange(filepaths, readHeader); 273 274 if (hasHeader) 275 { 276 throwIfWindowsNewline(inputSources.front.header, inputSources.front.name, 1); 277 headerFields = inputSources.front.header.split(delim).to!(string[]); 278 fieldListArgProcessing(); 279 } 280 } 281 catch (Exception exc) 282 { 283 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 284 return tuple(false, 1); 285 } 286 return tuple(true, 0); 287 } 288 } 289 290 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 291 292 /** Main program. Processes command line arguments and calls tsvUniq which implements 293 * the main processing logic. 294 */ 295 int main(string[] cmdArgs) 296 { 297 /* When running in DMD code coverage mode, turn on report merging. */ 298 version(D_Coverage) version(DigitalMars) 299 { 300 import core.runtime : dmd_coverSetMerge; 301 dmd_coverSetMerge(true); 302 } 303 304 TsvUniqOptions cmdopt; 305 auto r = cmdopt.processArgs(cmdArgs); 306 if (!r[0]) return r[1]; 307 308 version(LDC_Profile) 309 { 310 import ldc.profile : resetAll; 311 resetAll(); 312 } 313 314 try tsvUniq(cmdopt); 315 catch (Exception exc) 316 { 317 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 318 return 1; 319 } 320 return 0; 321 } 322 323 /** Outputs the unique lines from all the input files. 324 * 325 * Processes the lines in each input file. All lines are added to an associated array. 326 * The first time a line is seen it is output. If key fields are being used these are 327 * used as the basis for the associative array entries rather than the full line. 328 */ 329 void tsvUniq(ref TsvUniqOptions cmdopt) 330 { 331 import tsv_utils.common.utils : bufferedByLine, BufferedOutputRange, InputFieldReordering, 332 InputSourceRange, joinAppend, LineBuffered, throwIfWindowsNewline; 333 import std.algorithm : splitter; 334 import std.array : appender; 335 import std.conv : to; 336 import std.uni : asLowerCase; 337 import std.utf : byChar; 338 339 /* inputSources must be an InputSourceRange and include at least stdin. */ 340 assert(!cmdopt.inputSources.empty); 341 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 342 343 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 344 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); 345 346 /* Both input and output are read one line at a time when in line-buffered mode. */ 347 immutable LineBuffered isLineBuffered = cmdopt.lineBuffered ? Yes.lineBuffered : No.lineBuffered; 348 349 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 350 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout, isLineBuffered); 351 352 /* The master hash. The key is the specified fields concatenated together (including 353 * separators). The value is a struct with the equiv-id and occurrence count. 354 */ 355 static struct EquivEntry { size_t equivID; size_t count; } 356 EquivEntry[string] equivHash; 357 358 /* Reusable buffers for multi-field keys and case-insensitive keys. */ 359 auto multiFieldKeyBuffer = appender!(char[]); 360 auto lowerKeyBuffer = appender!(char[]); 361 362 const size_t numKeyFields = cmdopt.fields.length; 363 long nextEquivID = cmdopt.equivStartID; 364 365 /* First header is read during command line arg processing. Flush it immediately 366 * so subsequent processes in a unix command pipeline see it early. This helps 367 * provide timely error messages. 368 */ 369 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 370 { 371 auto inputStream = cmdopt.inputSources.front; 372 373 bufferedOutput.append(inputStream.header); 374 375 if (cmdopt.equivMode) bufferedOutput.append(cmdopt.delim, cmdopt.equivHeader); 376 if (cmdopt.numberMode) bufferedOutput.append(cmdopt.delim, cmdopt.numberHeader); 377 378 bufferedOutput.appendln(); 379 bufferedOutput.flush(); 380 } 381 382 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 383 384 foreach (inputStream; cmdopt.inputSources) 385 { 386 if (cmdopt.hasHeader) throwIfWindowsNewline(inputStream.header, inputStream.name, 1); 387 388 foreach (lineNum, line; 389 inputStream 390 .file 391 .bufferedByLine(isLineBuffered) 392 .enumerate(fileBodyStartLine)) 393 { 394 if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum); 395 396 /* Start by finding the key. */ 397 typeof(line) key; 398 if (cmdopt.keyIsFullLine) 399 { 400 key = line; 401 } 402 else 403 { 404 assert(keyFieldsReordering !is null); 405 406 /* Copy the key fields to a new buffer. */ 407 keyFieldsReordering.initNewLine; 408 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 409 { 410 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 411 if (keyFieldsReordering.allFieldsFilled) break; 412 } 413 414 enforce(keyFieldsReordering.allFieldsFilled, 415 format("Not enough fields in line. File: %s, Line: %s", 416 inputStream.name, lineNum)); 417 418 if (numKeyFields == 1) 419 { 420 key = keyFieldsReordering.outputFields[0]; 421 } 422 else 423 { 424 multiFieldKeyBuffer.clear(); 425 keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); 426 key = multiFieldKeyBuffer.data; 427 } 428 } 429 430 if (cmdopt.ignoreCase) 431 { 432 /* Equivalent to key = key.toLower, but without memory allocation. */ 433 lowerKeyBuffer.clear(); 434 lowerKeyBuffer.put(key.asLowerCase.byChar); 435 key = lowerKeyBuffer.data; 436 } 437 438 bool isOutput = false; 439 EquivEntry currEntry; 440 EquivEntry* priorEntry = (key in equivHash); 441 if (priorEntry is null) 442 { 443 isOutput = (cmdopt.atLeast <= 1); 444 currEntry.equivID = nextEquivID; 445 currEntry.count = 1; 446 equivHash[key.to!string] = currEntry; 447 nextEquivID++; 448 } 449 else 450 { 451 (*priorEntry).count++; 452 currEntry = *priorEntry; 453 454 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 455 (cmdopt.equivMode && cmdopt.max == 0) || 456 (cmdopt.numberMode && cmdopt.max == 0)) 457 { 458 isOutput = true; 459 } 460 } 461 462 if (isOutput) 463 { 464 bufferedOutput.append(line); 465 466 if (cmdopt.equivMode) 467 { 468 bufferedOutput.append(cmdopt.delim, currEntry.equivID.to!string); 469 } 470 471 if (cmdopt.numberMode) 472 { 473 bufferedOutput.append(cmdopt.delim, currEntry.count.to!string); 474 } 475 476 bufferedOutput.appendln(); 477 } 478 } 479 } 480 }