1 /** 2 Command line tool that identifies equivalent lines in an input stream. Equivalent 3 lines are identified using either the full line or a set of fields as the key. By 4 default, input is written to standard output, retaining only the first occurrence of 5 equivalent lines. There are also options for marking and numbering equivalent lines 6 rather, without filtering out duplicates. 7 8 This tool is similar in spirit to the Unix 'uniq' tool, with some key differences. 9 First, the key can be composed of individual fields, not just the full line. Second, 10 input does not need to be sorted. (Unix 'uniq' only detects equivalent lines when 11 they are adjacent, hence the usual need for sorting.) 12 13 There are a couple alternative to uniq'ing the input lines. One is to mark lines with 14 an equivalence ID, which is a one-upped counter. The other is to number lines, with 15 each unique key have its own set of numbers. 16 17 Copyright (c) 2015-2020, eBay Inc. 18 Initially written by Jon Degenhardt 19 20 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt) 21 */ 22 module tsv_utils.tsv_uniq; 23 24 import std.stdio; 25 import std.format : format; 26 import std.typecons : tuple; 27 28 auto helpText = q"EOS 29 Synopsis: tsv-uniq [options] [file...] 30 31 tsv-uniq filters out duplicate lines using fields as a key. Filtering is 32 based on the entire line when key fields are not provided. Options are 33 also available for assigning a unique id to each key and numbering the 34 occurrences of each key. Use '--help-verbose' for more details. 35 36 Options: 37 EOS"; 38 39 auto helpTextVerbose = q"EOS 40 Synopsis: tsv-uniq [options] [file...] 41 42 tsv-uniq identifies equivalent lines in tab-separated value files. Input 43 is read line by line, recording a key for each line based on one or more 44 of the fields. Two lines are equivalent if they have the same key. The 45 first time a key is seen its line is written to standard output. 46 Subsequent lines containing the same key are discarded. This command 47 uniq's a file on fields 2 and 3: 48 49 tsv-uniq -f 2,3 file.tsv 50 51 This is similar to the Unix 'uniq' program, but based on individual 52 fields and without requiring sorted data. 53 54 tsv-uniq can be run without specifying a key field. In this case the 55 whole line is used as a key, same as the Unix 'uniq' program. This works 56 on any line-oriented text file, not just TSV files. 57 58 The above is the default behavior ('uniq' mode). The alternates to 'uniq' 59 mode are 'number' mode and 'equiv-class' mode. In 'equiv-class' mode, all 60 lines are written to standard output, but with a field appended marking 61 equivalent entries with an ID. The ID is a one-upped counter. Example: 62 63 tsv-uniq --header -f 2,3 --equiv file.tsv 64 65 'Number' mode also writes all lines to standard output, but with a field 66 appended numbering the occurrence count for the line's key. The first line 67 with a specific key is assigned the number '1', the second with the key is 68 assigned number '2', etc. 'Number' and 'equiv-class' modes can be combined. 69 70 The '--r|repeated' option can be used to print only lines occurring more 71 than once. Specifically, the second occurrence of a key is printed. The 72 '--a|at-least N' option is similar, printing lines occurring at least N 73 times. (Like repeated, the Nth line with the key is printed.) 74 75 The '--m|max MAX' option changes the behavior to output the first MAX 76 lines for each key, rather than just the first line for each key. 77 78 If both '--a|at-least' and '--m|max' are specified, the occurrences 79 starting with 'at-least' and ending with 'max' are output. 80 81 Options: 82 EOS"; 83 84 /** Container for command line options. 85 */ 86 struct TsvUniqOptions 87 { 88 enum defaultEquivHeader = "equiv_id"; 89 enum defaultEquivStartID = 1; 90 enum defaultNumberHeader = "equiv_line"; 91 92 string programName; 93 bool helpVerbose = false; // --h|help-verbose 94 bool versionWanted = false; // --V|version 95 size_t[] fields; // --f|fields 96 bool hasHeader = false; // --H|header 97 bool onlyRepeated = false; // --r|repeated. Shorthand for '--atleast 2' 98 size_t atLeast = 0; // --a|at-least. Zero implies default behavior. 99 size_t max = 0; // --m|max. Zero implies default behavior. 100 bool numberMode = false; // --z|number 101 string numberHeader = defaultNumberHeader; // --number-header 102 bool equivMode = false; // --e|equiv 103 string equivHeader = defaultEquivHeader; // --equiv-header 104 long equivStartID = defaultEquivStartID; // --equiv-start 105 bool ignoreCase = false; // --i|ignore-case 106 char delim = '\t'; // --d|delimiter 107 bool keyIsFullLine = false; // Derived. True if no fields specified or '--f|fields 0' 108 109 /* Returns a tuple. First value is true if command line arguments were successfully 110 * processed and execution should continue, or false if an error occurred or the user 111 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 112 * 113 * Returning true (execution continues) means args have been validated and derived 114 * values calculated. In addition, field indices have been converted to zero-based. 115 * If the whole line is the key, the individual fields list will be cleared. 116 * 117 * Repeat count control variables 'atLeast' and max' - These values are left at zero 118 * if no repeat count options are specified. They are set if repeat count options 119 * are specified, as follows: 120 * * atLeast - Will be zero unless --r|repeated or --a|at-least is specified. 121 * --r|repeated option sets it 2, --a|at-least sets it to the specified value. 122 * * max - Default to zero. Is set to the --m|max value if provided. Is set to 123 * 'atLeast' if --r|repeated or --a|at-least is provided. 124 * 125 * An exception to the above: If --e|equiv-mode is specified, then (max == 0) 126 * represents the default "output all values" case. In this case max may be less 127 * than the at-least value. 128 */ 129 auto processArgs (ref string[] cmdArgs) 130 { 131 import std.algorithm : any, each; 132 import std.getopt; 133 import std.path : baseName, stripExtension; 134 import std.typecons : Yes, No; 135 import tsv_utils.common.utils : makeFieldListOptionHandler; 136 137 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 138 139 try 140 { 141 arraySep = ","; // Use comma to separate values in command line options 142 auto r = getopt( 143 cmdArgs, 144 "help-verbose", " Print full help.", &helpVerbose, 145 std.getopt.config.caseSensitive, 146 "V|version", " Print version information and exit.", &versionWanted, 147 "H|header", " Treat the first line of each file as a header.", &hasHeader, 148 std.getopt.config.caseInsensitive, 149 150 "f|fields", "<field-list> Fields to use as the key. Default: 0 (entire line).", 151 fields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex,Yes.allowFieldNumZero), 152 153 "i|ignore-case", " Ignore case when comparing keys.", &ignoreCase, 154 "r|repeated", " Output only lines that are repeated (based on the key).", &onlyRepeated, 155 "a|at-least", "INT Output only lines that are repeated INT times (based on the key). Zero and one are ignored.", &atLeast, 156 "m|max", "INT Max number of each unique key to output (zero is ignored).", &max, 157 "e|equiv", " Output equivalence class IDs rather than uniq'ing entries.", &equivMode, 158 "equiv-header", "STR Use STR as the equiv-id field header (when using '-H --equiv'). Default: 'equiv_id'.", &equivHeader, 159 "equiv-start", "INT Use INT as the first equiv-id. Default: 1.", &equivStartID, 160 "z|number", " Output equivalence class occurrence counts rather than uniq'ing entries.", &numberMode, 161 "number-header", "STR Use STR as the '--number' field header (when using '-H --number)'. Default: 'equiv_line'.", &numberHeader, 162 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim, 163 ); 164 165 if (r.helpWanted) 166 { 167 defaultGetoptPrinter(helpText, r.options); 168 return tuple(false, 0); 169 } 170 else if (helpVerbose) 171 { 172 defaultGetoptPrinter(helpTextVerbose, r.options); 173 return tuple(false, 0); 174 } 175 else if (versionWanted) 176 { 177 import tsv_utils.common.tsvutils_version; 178 writeln(tsvutilsVersionNotice("tsv-uniq")); 179 return tuple(false, 0); 180 } 181 182 /* Consistency checks */ 183 if (!equivMode) 184 { 185 if (equivHeader != defaultEquivHeader) 186 { 187 throw new Exception("--equiv-header requires --e|equiv"); 188 } 189 else if (equivStartID != defaultEquivStartID) 190 { 191 throw new Exception("--equiv-start requires --e|equiv"); 192 } 193 } 194 195 if (!numberMode && numberHeader != defaultNumberHeader) 196 { 197 throw new Exception("--number-header requires --z|number"); 198 } 199 200 if (fields.length > 1 && fields.any!(x => x == 0)) 201 { 202 throw new Exception("Whole line as key (--f|field 0) cannot be combined with multiple fields."); 203 } 204 205 /* Derivations */ 206 if (fields.length == 0) 207 { 208 keyIsFullLine = true; 209 } 210 else if (fields.length == 1 && fields[0] == 0) 211 { 212 keyIsFullLine = true; 213 fields.length = 0; 214 } 215 216 if (onlyRepeated && atLeast <= 1) atLeast = 2; 217 if (atLeast >= 2 && max < atLeast) 218 { 219 // Don't modify max if it is zero and equivMode or numberMode is in effect. 220 if (max != 0 || (!equivMode && !numberMode)) max = atLeast; 221 } 222 223 if (!keyIsFullLine) fields.each!((ref x) => --x); // Convert to 0-based indexing. 224 225 } 226 catch (Exception exc) 227 { 228 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 229 return tuple(false, 1); 230 } 231 return tuple(true, 0); 232 } 233 } 234 235 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 236 237 /** Main program. Processes command line arguments and calls tsvUniq which implements 238 * the main processing logic. 239 */ 240 int main(string[] cmdArgs) 241 { 242 /* When running in DMD code coverage mode, turn on report merging. */ 243 version(D_Coverage) version(DigitalMars) 244 { 245 import core.runtime : dmd_coverSetMerge; 246 dmd_coverSetMerge(true); 247 } 248 249 TsvUniqOptions cmdopt; 250 auto r = cmdopt.processArgs(cmdArgs); 251 if (!r[0]) return r[1]; 252 253 version(LDC_Profile) 254 { 255 import ldc.profile : resetAll; 256 resetAll(); 257 } 258 259 try tsvUniq(cmdopt, cmdArgs[1..$]); 260 catch (Exception exc) 261 { 262 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 263 return 1; 264 } 265 return 0; 266 } 267 268 /** Outputs the unique lines from all the input files. 269 * 270 * Processes the lines in each input file. All lines are added to an associated array. 271 * The first time a line is seen it is output. If key fields are being used these are 272 * used as the basis for the associative array entries rather than the full line. 273 */ 274 void tsvUniq(const TsvUniqOptions cmdopt, const string[] inputFiles) 275 { 276 import tsv_utils.common.utils : InputFieldReordering, bufferedByLine, BufferedOutputRange, joinAppend; 277 import std.algorithm : splitter; 278 import std.array : appender; 279 import std.conv : to; 280 import std.range; 281 import std.uni : asLowerCase; 282 import std.utf : byChar; 283 284 /* InputFieldReordering maps the key fields from an input line to a separate buffer. */ 285 auto keyFieldsReordering = cmdopt.keyIsFullLine ? null : new InputFieldReordering!char(cmdopt.fields); 286 287 /* BufferedOutputRange is a performance enhancement for writing to stdout. */ 288 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 289 290 /* The master hash. The key is the specified fields concatenated together (including 291 * separators). The value is a struct with the equiv-id and occurrence count. 292 */ 293 static struct EquivEntry { size_t equivID; size_t count; } 294 EquivEntry[string] equivHash; 295 296 /* Reusable buffers for multi-field keys and case-insensitive keys. */ 297 auto multiFieldKeyBuffer = appender!(char[]); 298 auto lowerKeyBuffer = appender!(char[]); 299 300 const size_t numKeyFields = cmdopt.fields.length; 301 long nextEquivID = cmdopt.equivStartID; 302 bool headerWritten = false; 303 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 304 { 305 auto inputStream = (filename == "-") ? stdin : filename.File(); 306 foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1)) 307 { 308 if (cmdopt.hasHeader && lineNum == 1) 309 { 310 /* Header line. */ 311 if (!headerWritten) 312 { 313 bufferedOutput.append(line); 314 315 if (cmdopt.equivMode) 316 { 317 bufferedOutput.append(cmdopt.delim); 318 bufferedOutput.append(cmdopt.equivHeader); 319 } 320 321 if (cmdopt.numberMode) 322 { 323 bufferedOutput.append(cmdopt.delim); 324 bufferedOutput.append(cmdopt.numberHeader); 325 } 326 327 bufferedOutput.appendln(); 328 headerWritten = true; 329 } 330 } 331 else 332 { 333 /* Regular line (not header). Start by finding the key. */ 334 typeof(line) key; 335 if (cmdopt.keyIsFullLine) 336 { 337 key = line; 338 } 339 else 340 { 341 assert(keyFieldsReordering !is null); 342 343 /* Copy the key fields to a new buffer. */ 344 keyFieldsReordering.initNewLine; 345 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 346 { 347 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 348 if (keyFieldsReordering.allFieldsFilled) break; 349 } 350 351 if (!keyFieldsReordering.allFieldsFilled) 352 { 353 throw new Exception( 354 format("Not enough fields in line. File: %s, Line: %s", 355 (filename == "-") ? "Standard Input" : filename, lineNum)); 356 } 357 358 if (numKeyFields == 1) 359 { 360 key = keyFieldsReordering.outputFields[0]; 361 } 362 else 363 { 364 multiFieldKeyBuffer.clear(); 365 keyFieldsReordering.outputFields.joinAppend(multiFieldKeyBuffer, cmdopt.delim); 366 key = multiFieldKeyBuffer.data; 367 } 368 } 369 370 if (cmdopt.ignoreCase) 371 { 372 /* Equivalent to key = key.toLower, but without memory allocation. */ 373 lowerKeyBuffer.clear(); 374 lowerKeyBuffer.put(key.asLowerCase.byChar); 375 key = lowerKeyBuffer.data; 376 } 377 378 bool isOutput = false; 379 EquivEntry currEntry; 380 EquivEntry* priorEntry = (key in equivHash); 381 if (priorEntry is null) 382 { 383 isOutput = (cmdopt.atLeast <= 1); 384 currEntry.equivID = nextEquivID; 385 currEntry.count = 1; 386 equivHash[key.to!string] = currEntry; 387 nextEquivID++; 388 } 389 else 390 { 391 (*priorEntry).count++; 392 currEntry = *priorEntry; 393 394 if ((currEntry.count <= cmdopt.max && currEntry.count >= cmdopt.atLeast) || 395 (cmdopt.equivMode && cmdopt.max == 0) || 396 (cmdopt.numberMode && cmdopt.max == 0)) 397 { 398 isOutput = true; 399 } 400 } 401 402 if (isOutput) 403 { 404 bufferedOutput.append(line); 405 406 if (cmdopt.equivMode) 407 { 408 bufferedOutput.append(cmdopt.delim); 409 bufferedOutput.append(currEntry.equivID.to!string); 410 } 411 412 if (cmdopt.numberMode) 413 { 414 bufferedOutput.append(cmdopt.delim); 415 bufferedOutput.append(currEntry.count.to!string); 416 } 417 418 bufferedOutput.appendln(); 419 } 420 } 421 } 422 } 423 }