commit 0dc32e63b23f7c1446be7abd466ea86aefb321c1 Author: Niko Medvesky Date: Sun Feb 2 00:04:04 2025 -0600 First working version v0.1.0 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..919a0d5 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,433 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys", +] + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "bytesize" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" +dependencies = [ + "serde", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "4.5.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "console" +version = "0.15.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys", +] + 
+[[package]] +name = "duplicated" +version = "0.1.0" +dependencies = [ + "bytesize", + "clap", + "indicatif", + "seahash", +] + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "portable-atomic" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + +[[package]] +name = "serde" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.96" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = 
"unicode-ident" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" + +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" 
+dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..90c700e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "duplicated" +version = "0.1.0" +edition = "2021" + +[dependencies] +seahash = "4.1" +clap = { version = "4.5", features = ["derive"] } +bytesize = {version = "1.2.0", features = ["serde"]} +indicatif = "0.17.11" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..a64c837 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,213 @@ +use std::error::Error; +use std::fs; +use std::path::{Path, PathBuf}; + +use bytesize::ByteSize; +use clap::Parser; +use indicatif::ProgressBar; + +extern crate bytesize; + +#[derive(Parser)] +#[command(version, about, long_about = None)] +pub struct Cli { + path: PathBuf, + + #[arg(short = 's', long, value_name = "SIZE")] + min_size: Option, + + #[arg(short = 'm', long, value_name = "SIZE")] + max_size: Option, +} + +pub fn run(args: Cli) -> Result<(), Box> { + let files = navigate_file_tree(&args.path, &args.min_size, &args.max_size)?; + println!("Scanning {} files", files.len()); + + let hashes = hash_files(&files)?; + let repeats = find_duplicates(&files, &hashes); + + if repeats.is_empty() { + println!("No duplicate files found!"); + return Ok(()); + } + + println!("{} Duplicate files found", 
repeats.len()); + for duplicate in repeats { + let size = ByteSize::b(fs::metadata(duplicate[0])?.len()); + let paths = duplicate + .into_iter() + .map(|path| path.to_string_lossy()) + .collect::>() + .join(" "); + + println!("{size} {paths}"); + } + + Ok(()) +} + +fn is_filesize_in_range( + filesize: u64, + min_size: &Option, + max_size: &Option, +) -> bool { + if let Some(size) = min_size { + if filesize < size.0 { + return false; + } + } + + if let Some(size) = max_size { + if filesize > size.0 { + return false; + } + } + + true +} + +pub fn navigate_file_tree( + path: &Path, + min_size: &Option, + max_size: &Option, +) -> Result, Box> { + let mut files: Vec = Vec::new(); + if !path.is_dir() { + let metadata = fs::metadata(path); + if let Err(err) = metadata { + eprintln!("{err}"); + return Ok(files); + } + + let filesize = metadata?.len(); + + if !is_filesize_in_range(filesize, min_size, max_size) { + return Ok(files); + } + + files.push(path.to_path_buf()); + return Ok(files); + } + + let children = fs::read_dir(path); + if let Err(err) = children { + eprintln!("{err}"); + return Ok(files); + } + + for child in children? 
{ + let child = child?; + let child_path = child.path(); + if child_path.is_dir() { + files.append(&mut navigate_file_tree(&child_path, min_size, max_size)?); + } else { + let metadata = fs::metadata(&child_path); + if let Err(err) = metadata { + eprintln!("{err}"); + return Ok(files); + } + + let filesize = metadata?.len(); + if !is_filesize_in_range(filesize, min_size, max_size) { + continue; + } + + files.push(child_path.to_path_buf()); + } + } + + Ok(files) +} + +pub fn hash_files(files: &Vec) -> Result, Box> { + let pb = ProgressBar::new(files.len().try_into()?); + + let mut hashes: Vec = Vec::new(); + for file in files { + //let buf = fs::read(file).unwrap_or_else(); + let buf = match fs::read(file) { + Ok(t) => t, + Err(_e) => { + //eprint!("{e}"); + continue; + } + }; + + hashes.push(seahash::hash(&buf)); + + pb.inc(1); + } + + pb.finish_with_message("hashing finished"); + + Ok(hashes) +} + +pub fn find_duplicates<'a>(files: &'a [PathBuf], hashes: &[u64]) -> Vec> { + let mut repeated: Vec> = Vec::new(); + let mut checked_indices: Vec = Vec::new(); + + for (i, hash) in hashes.iter().enumerate() { + if checked_indices.contains(&i) { + continue; + } + + let mut duplicate_hashes: Vec<&PathBuf> = Vec::new(); + duplicate_hashes.push(&files[i]); + + // Check all hashes if equal + for (j, other_hash) in hashes.iter().enumerate() { + if hash == other_hash && i != j { + checked_indices.push(j); + duplicate_hashes.push(&files[j]); + } + } + if duplicate_hashes.len() > 1 { + repeated.push(duplicate_hashes) + } + } + + repeated +} + +#[cfg(test)] +mod test { + use super::*; + + // Tests navigate_file_tree + #[test] + fn nav_filetree() { + let path = PathBuf::from("./test-data"); + let results = navigate_file_tree(&path, &None, &None).unwrap(); + + let files = [ + PathBuf::from("./test-data/file1"), + PathBuf::from("./test-data/file2"), + PathBuf::from("./test-data/file3"), + ]; + + assert_eq!(results.len(), 3); + assert!(results.contains(&files[0])); + 
assert!(results.contains(&files[1])); + assert!(results.contains(&files[2])); + } + + // Tests hash_files and find_duplicates + #[test] + fn find_duplicate_files() { + let files = vec![ + PathBuf::from("./test-data/file1"), + PathBuf::from("./test-data/file2"), + PathBuf::from("./test-data/file3"), + ]; + + let hashes = hash_files(&files).unwrap(); + + let results = find_duplicates(&files, &hashes); + + assert_eq!(results.len(), 1); + assert!(results[0].contains(&&files[0])); + assert!(results[0].contains(&&files[1])); + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..bceedd8 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,14 @@ +use std::process; + +use clap::Parser; +use duplicated::Cli; + +fn main() { + let args = Cli::parse(); + + if let Err(e) = duplicated::run(args) { + eprintln!("Application error: {e}"); + process::exit(1); + }; + +} diff --git a/test-data/file1 b/test-data/file1 new file mode 100644 index 0000000..8ab686e --- /dev/null +++ b/test-data/file1 @@ -0,0 +1 @@ +Hello, World! diff --git a/test-data/file2 b/test-data/file2 new file mode 100644 index 0000000..8ab686e --- /dev/null +++ b/test-data/file2 @@ -0,0 +1 @@ +Hello, World! diff --git a/test-data/file3 b/test-data/file3 new file mode 100644 index 0000000..e69de29