random

- collection of unsorted bollocks
git clone git://git.acid.vegas/random.git
Log | Files | Refs | Archive

bigshuf (1755B)

      1 #!/bin/sh
      2 # bigshuf - developed by acidvegas (https://git.acid.vegas/random)
      3 # shuffles the lines in large files, randomizing the order while using a memory-safe approach
      4 
      5 # Check if enough arguments are provided
      6 if [ "$#" -lt 3 ]; then
      7 	echo "Usage: $0 inputfile tempdir outputfile [lines per chunk]" >&2
      8 	exit 1
      9 fi
     10 
     11 # Parse input arguments
     12 inputfile="$1"
     13 tempdir="$2"
     14 outputfile="$3"
     15 lines_per_chunk="${4:-10000}"
     16 
     17 # Check if input file exists
     18 if [ ! -f "$inputfile" ]; then
     19 	echo "Error: Input file does not exist" >&2
     20 	exit 1
     21 fi
     22 
     23 # Calculate required and available space
     24 required_space=$(( $(wc -c < "$inputfile") * 2 ))
     25 available_space=$(df --block-size=1K --output=avail "$tempdir" | tail -n 1)
     26 
     27 # Check if there is enough disk space in tempdir
     28 if [ "$available_space" -lt "$required_space" ]; then
     29 	echo "Error: Not enough disk space in $tempdir" >&2
     30 	exit 1
     31 fi
     32 
     33 # Check if tempdir is writable
     34 if [ ! -d "$tempdir" ]; then
     35 	mkdir -p "$tempdir" || { echo "Error: Unable to create temp directory" >&2; exit 1; }
     36 elif [ ! -w "$tempdir" ]; then
     37 	echo "Error: Temp directory is not writable" >&2
     38 	exit 1
     39 fi
     40 
     41 # Split the file by lines
     42 split -l "$lines_per_chunk" "$inputfile" "$tempdir/chunk_" || { echo "Error: Failed to split file" >&2; rm -rf "$tempdir"; exit 1; }
     43 
     44 # Create a file with a shuffled list of chunk files
     45 find "$tempdir" -name 'chunk_*' | shuf > "$tempdir/chunks_list.txt" || { echo "Error: Failed to create shuffled chunks list" >&2; rm -rf "$tempdir"; exit 1; }
     46 
     47 # Shuffle each chunk based on the shuffled list and append to the output file
     48 while read -r chunk; do
     49 	shuf "$chunk" >> "$outputfile" || { echo "Error: Failed to shuffle and append chunk $chunk" >&2; break; }
     50 done < "$tempdir/chunks_list.txt"
     51 
     52 # Clean up
     53 rm -rf "$tempdir"