First working version v0.1.0

This commit is contained in:
2025-02-02 00:04:04 -06:00
commit 0dc32e63b2
8 changed files with 673 additions and 0 deletions

213
src/lib.rs Normal file
View File

@@ -0,0 +1,213 @@
extern crate bytesize;

use std::collections::HashMap;
use std::error::Error;
use std::fs;
use std::path::{Path, PathBuf};

use bytesize::ByteSize;
use clap::Parser;
use indicatif::ProgressBar;
// Command-line arguments, parsed via clap's derive API.
// NOTE(review): `//` comments are used deliberately here — `///` doc comments
// on clap-derive fields become user-visible `--help` text and would change
// the program's CLI output.
#[derive(Parser)]
#[command(version, about, long_about = None)]
pub struct Cli {
    // Positional argument: root file or directory to scan for duplicates.
    path: PathBuf,
    // -s / --min-size: skip files smaller than this (parsed by ByteSize's
    // FromStr, so values like "10MB" are accepted). Optional — no lower bound
    // when absent.
    #[arg(short = 's', long, value_name = "SIZE")]
    min_size: Option<ByteSize>,
    // -m / --max-size: skip files larger than this. Optional — no upper bound
    // when absent.
    #[arg(short = 'm', long, value_name = "SIZE")]
    max_size: Option<ByteSize>,
}
/// Application entry point: walk `args.path`, hash every file that passes the
/// size filters, and print each group of duplicates as
/// `<size> <path> <path> ...` (one line per group).
///
/// # Errors
/// Propagates failures from the directory walk, hashing, or the final
/// metadata lookup used to report each group's file size.
pub fn run(args: Cli) -> Result<(), Box<dyn Error>> {
    let files = navigate_file_tree(&args.path, &args.min_size, &args.max_size)?;
    println!("Scanning {} files", files.len());

    let hashes = hash_files(&files)?;
    let repeats = find_duplicates(&files, &hashes);

    if repeats.is_empty() {
        println!("No duplicate files found!");
        return Ok(());
    }
    println!("{} Duplicate files found", repeats.len());

    for group in repeats {
        // All members of a group have identical contents, so the first
        // member's size describes the whole group.
        let size = ByteSize::b(fs::metadata(group[0])?.len());

        // Space-separated list of the group's paths.
        let mut paths = String::new();
        for path in group {
            if !paths.is_empty() {
                paths.push(' ');
            }
            paths.push_str(&path.to_string_lossy());
        }
        println!("{size} {paths}");
    }
    Ok(())
}
/// Returns `true` when `filesize` lies within the optional inclusive bounds.
/// An absent bound (`None`) imposes no constraint on that side.
fn is_filesize_in_range(
    filesize: u64,
    min_size: &Option<ByteSize>,
    max_size: &Option<ByteSize>,
) -> bool {
    // `ByteSize` is a tuple struct whose `.0` is the size in bytes.
    let above_min = min_size.as_ref().map_or(true, |min| filesize >= min.0);
    let below_max = max_size.as_ref().map_or(true, |max| filesize <= max.0);
    above_min && below_max
}
/// Recursively collects the regular files under `path` whose sizes fall
/// within the optional inclusive `min_size`/`max_size` bounds.
///
/// Unreadable entries (metadata failures, unreadable directories) are
/// reported on stderr and skipped so one bad entry does not abort the walk.
///
/// # Errors
/// Returns an error only when iterating an already-opened directory fails.
pub fn navigate_file_tree(
    path: &Path,
    min_size: &Option<ByteSize>,
    max_size: &Option<ByteSize>,
) -> Result<Vec<PathBuf>, Box<dyn Error>> {
    let mut files: Vec<PathBuf> = Vec::new();

    // Non-directory: the path itself is the only candidate.
    if !path.is_dir() {
        match fs::metadata(path) {
            Ok(metadata) => {
                if is_filesize_in_range(metadata.len(), min_size, max_size) {
                    files.push(path.to_path_buf());
                }
            }
            Err(err) => eprintln!("{err}"),
        }
        return Ok(files);
    }

    let children = match fs::read_dir(path) {
        Ok(children) => children,
        Err(err) => {
            // Unreadable directory (e.g. permissions): report and skip.
            eprintln!("{err}");
            return Ok(files);
        }
    };

    for child in children {
        let child = child?;
        let child_path = child.path();
        if child_path.is_dir() {
            files.append(&mut navigate_file_tree(&child_path, min_size, max_size)?);
        } else {
            // BUG FIX: a failed metadata read previously did
            // `return Ok(files)`, silently dropping every remaining entry in
            // this directory. Skip only the offending entry instead.
            match fs::metadata(&child_path) {
                Ok(metadata) => {
                    if is_filesize_in_range(metadata.len(), min_size, max_size) {
                        // `child_path` is already an owned PathBuf; no need to
                        // re-copy it with `to_path_buf()`.
                        files.push(child_path);
                    }
                }
                Err(err) => eprintln!("{err}"),
            }
        }
    }
    Ok(files)
}
/// Hashes each file's contents with SeaHash, returning one `u64` per input.
///
/// The result is index-aligned with `files`: entry `i` is the hash of
/// `files[i]`. Files that cannot be read are reported on stderr and given a
/// fallback hash derived from their (unique) path, so alignment is preserved
/// and unreadable files are never reported as duplicates of anything.
///
/// # Errors
/// Returns an error only if the file count does not fit in a `u64` (for the
/// progress bar length).
pub fn hash_files(files: &[PathBuf]) -> Result<Vec<u64>, Box<dyn Error>> {
    let pb = ProgressBar::new(files.len().try_into()?);
    let mut hashes: Vec<u64> = Vec::with_capacity(files.len());
    for file in files {
        // BUG FIX: unreadable files were skipped with `continue`, which shifted
        // every subsequent hash and made find_duplicates pair hashes with the
        // WRONG paths. Hashing the path on failure keeps the vectors aligned;
        // distinct paths give distinct hashes, so such entries match nothing.
        let hash = match fs::read(file) {
            Ok(buf) => seahash::hash(&buf),
            Err(err) => {
                eprintln!("{}: {err}", file.display());
                seahash::hash(file.to_string_lossy().as_bytes())
            }
        };
        hashes.push(hash);
        pb.inc(1);
    }
    pb.finish_with_message("hashing finished");
    Ok(hashes)
}
/// Groups files whose hashes are equal.
///
/// `hashes` must be index-aligned with `files` (as produced by `hash_files`:
/// `hashes[i]` is the hash of `files[i]`). Returns one group per hash value
/// that occurs more than once, in order of first occurrence, with each
/// group's files in their original order — matching the previous
/// implementation's output.
pub fn find_duplicates<'a>(files: &'a [PathBuf], hashes: &[u64]) -> Vec<Vec<&'a PathBuf>> {
    // Single O(n) grouping pass, replacing the previous O(n²) pairwise scan
    // (which also did an O(n) `Vec::contains` per element).
    let mut groups: HashMap<u64, Vec<&PathBuf>> = HashMap::new();
    // HashMap iteration order is unspecified, so record each hash the first
    // time it is seen to keep the output deterministic.
    let mut first_seen: Vec<u64> = Vec::new();
    for (file, &hash) in files.iter().zip(hashes) {
        let group = groups.entry(hash).or_default();
        if group.is_empty() {
            first_seen.push(hash);
        }
        group.push(file);
    }

    first_seen
        .into_iter()
        .filter_map(|hash| {
            let group = groups.remove(&hash)?;
            if group.len() > 1 {
                Some(group)
            } else {
                None
            }
        })
        .collect()
}
#[cfg(test)]
mod test {
    use super::*;

    // Walks the ./test-data fixture directory and checks every file in it is
    // discovered when no size filters are applied.
    #[test]
    fn nav_filetree() {
        let root = PathBuf::from("./test-data");
        let found = navigate_file_tree(&root, &None, &None).unwrap();

        let expected = [
            PathBuf::from("./test-data/file1"),
            PathBuf::from("./test-data/file2"),
            PathBuf::from("./test-data/file3"),
        ];
        assert_eq!(found.len(), 3);
        for path in &expected {
            assert!(found.contains(path));
        }
    }

    // Hashes the fixtures and checks that the identical-content pair
    // (file1, file2) forms the single reported duplicate group.
    #[test]
    fn find_duplicate_files() {
        let fixtures = vec![
            PathBuf::from("./test-data/file1"),
            PathBuf::from("./test-data/file2"),
            PathBuf::from("./test-data/file3"),
        ];
        let digests = hash_files(&fixtures).unwrap();
        let groups = find_duplicates(&fixtures, &digests);

        assert_eq!(groups.len(), 1);
        for dup in [&fixtures[0], &fixtures[1]] {
            assert!(groups[0].contains(&dup));
        }
    }
}

14
src/main.rs Normal file
View File

@@ -0,0 +1,14 @@
use std::process;
use clap::Parser;
use duplicated::Cli;
/// Binary entry point: parse CLI arguments and delegate to the library.
/// On failure, report the error and exit with a non-zero status code.
fn main() {
    let cli = Cli::parse();
    match duplicated::run(cli) {
        Ok(()) => {}
        Err(e) => {
            eprintln!("Application error: {e}");
            process::exit(1);
        }
    }
}