random - collection of un-sorted bollocks |
git clone git://git.acid.vegas/random.git |
Log | Files | Refs | Archive |
bigshuf (1755B)
#!/bin/sh
# bigshuf - developed by acidvegas (https://git.acid.vegas/random)
#
# Shuffles the lines of a large file without handing the whole file to a
# single shuf call: the input is split into chunks of N lines, the order of
# the chunks is shuffled, and each chunk is then shuffled on its own and
# appended to the output file.
#
# NOTE: lines that share a chunk stay adjacent in the output, so the result
# is not a uniform permutation of the whole file. A larger "lines per chunk"
# value mixes lines over a wider range.
#
# Usage: bigshuf inputfile tempdir outputfile [lines per chunk]
#   inputfile        regular file whose lines are shuffled
#   tempdir          directory for scratch files (created if missing)
#   outputfile       receives the shuffled lines (existing content replaced)
#   lines per chunk  number of lines per chunk, default 10000
#
# Requires: split, shuf, find, mktemp and a df that supports --output
# (GNU coreutils).
# Exit status: 0 on success, 1 on any error.

# Print an error message to stderr and abort with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

# Check if enough arguments are provided
if [ "$#" -lt 3 ]; then
  echo "Usage: $0 inputfile tempdir outputfile [lines per chunk]" >&2
  exit 1
fi

# Parse input arguments
inputfile="$1"
tempdir="$2"
outputfile="$3"
lines_per_chunk="${4:-10000}"

# Check if input file exists
[ -f "$inputfile" ] || die "Input file does not exist"

# Reject anything that is not a positive integer before split sees it
case "$lines_per_chunk" in
  *[!0-9]*) die "Lines per chunk must be a positive integer" ;;
esac
[ "$lines_per_chunk" -gt 0 ] || die "Lines per chunk must be a positive integer"

# Make sure tempdir exists and is writable. This has to happen before the
# disk-space check, because df cannot report on a path that does not exist.
created_tempdir=0
if [ ! -d "$tempdir" ]; then
  mkdir -p -- "$tempdir" || die "Unable to create temp directory"
  created_tempdir=1
elif [ ! -w "$tempdir" ]; then
  die "Temp directory is not writable"
fi

# Calculate required and available space, both in KiB so the comparison is
# like-for-like. Twice the input size is required, rounded up.
input_bytes=$(wc -c < "$inputfile") || die "Unable to read input file"
required_kb=$(( ($input_bytes * 2 + 1023) / 1024 ))
available_kb=$(df --block-size=1K --output=avail -- "$tempdir" | tail -n 1 | tr -d ' ')
case "$available_kb" in
  '' | *[!0-9]*) die "Unable to determine free space in $tempdir" ;;
esac

# Check if there is enough disk space in tempdir
if [ "$available_kb" -lt "$required_kb" ]; then
  die "Not enough disk space in $tempdir"
fi

# Work inside a private subdirectory so that only files created by this run
# are ever listed or deleted; the caller's tempdir may hold unrelated data.
workdir=$(mktemp -d "$tempdir/bigshuf.XXXXXX") || die "Unable to create temp directory"

# Remove the scratch files on every exit path. tempdir itself is removed only
# when this script created it, and only if nothing else is left inside it
# (rmdir refuses to delete a non-empty directory).
cleanup() {
  rm -rf -- "$workdir"
  if [ "$created_tempdir" -eq 1 ]; then
    rmdir -- "$tempdir" 2>/dev/null
  fi
}
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

# Split the file by lines
split -l "$lines_per_chunk" -- "$inputfile" "$workdir/chunk_" \
  || die "Failed to split file"

# Start from an empty output file so earlier content is not mixed into the
# result. Done after the split so that inputfile and outputfile may be the
# same path.
true > "$outputfile" || die "Unable to write output file $outputfile"

# Create a file with a shuffled list of chunk files
find "$workdir" -type f -name 'chunk_*' | shuf > "$workdir/chunks_list.txt" \
  || die "Failed to create shuffled chunks list"

# Shuffle each chunk based on the shuffled list and append to the output file
while IFS= read -r chunk; do
  shuf -- "$chunk" >> "$outputfile" \
    || die "Failed to shuffle and append chunk $chunk"
  # The chunk is no longer needed; free its space right away.
  rm -f -- "$chunk"
done < "$workdir/chunks_list.txt"

# Scratch files are removed by the EXIT trap.