大文件去重

小文件可以加载到内存一块去重, 大文件不行.

大文件去重步骤:

  1. 定义小文件大小 计算小文件总数
  2. 大文件按规则拆分到小文件  
  3. 小文件去重 
  4. 小文件合并为大文件

代码:

package com.util.file;

import com.util.FileUtils;
import lombok.Builder;
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * @description: 大文件去重
 * @author: AlbertXe
 * @create: 2020-10-21 10:23
 */
@Builder(toBuilder = true)
public class BigFileDistinctUtil {
    @Builder.Default
    private int size = 1 << 20;
    @Builder.Default
    private String charset = "gbk";
    @Builder.Default
    private List<File> tempFiles = new CopyOnWriteArrayList<>();
    private Function<String, String> function;


    public void uniq(File file) {
        if (!file.exists()) {
            throw new RuntimeException("文件不存在");
        }
        // 拆分小文件  小文件去重  合并  删除临时文件
        division(file);

        tempFiles.parallelStream().forEach(f -> uniqLittleFile(f));

        FileUtils.combineFiles(tempFiles, file);

        tempFiles.parallelStream().forEach(f -> f.delete());
    }

    private void uniqLittleFile(File file) {
        String path = file.getParent() + "/" + file.getName() + ".tmp";
        List<String> lines = FileUtils.lines(file, charset);
        ArrayList<String> resultList = lines.stream().collect(Collectors.collectingAndThen(Collectors.toCollection(() -> new TreeSet<>(Comparator.comparing(line -> function.apply(line)))), ArrayList::new));
        String result = resultList.stream().collect(Collectors.joining(System.lineSeparator()));
        if (StringUtils.isNotBlank(result)) {
            result += System.lineSeparator();
        }
        File tempFile = FileUtils.getFile(path);
        FileUtils.write(tempFile, result, charset);
        FileUtils.copyFile(tempFile, file);
        tempFile.delete();
    }

    @SneakyThrows
    private void division(File file) {
        long length = file.length();
        int fileSum = (int) (length / size + 1);
        List<BufferedWriter> writers = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset))) {
            for (int i = 0; i < fileSum; i++) {
                String tempFile = file.getParent() + "/" + file.getName() + "_" + i + ".temp";
                tempFiles.add(FileUtils.getFile(tempFile));
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tempFile), charset));
                writers.add(writer);
            }
            String line;
            while ((line = reader.readLine()) != null) {
                String key = function.apply(line);
                int hash = key.hashCode() & Integer.MAX_VALUE;
                int i = hash % fileSum;
                writers.get(i).write(line + System.lineSeparator());
            }
        } finally {
            writers.parallelStream().forEach(writer -> {
                try {
                    writer.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
        }
    }
}
package com.util;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

import javax.swing.filechooser.FileSystemView;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.List;

/**
 * @author AlbertXe
 * @date 2019-11-21 21:20
 */
@Slf4j
public class FileUtils {

    public static void main(String[] args) {
        getCD();
    }

    /**
     * NIO方式copy
     *
     * @param source
     * @param target
     * @throws IOException
     */
    public static void copyNIO(String source, String target) throws IOException {
        try (FileInputStream is = new FileInputStream(source)) {
            try (FileOutputStream os = new FileOutputStream(target)) {
                FileChannel outChannel = os.getChannel();
                FileChannel inChannel = is.getChannel();
                ByteBuffer buffer = ByteBuffer.allocate(4086);
                while (inChannel.read(buffer) != -1) {
                    buffer.flip();
                    outChannel.write(buffer);
                    buffer.clear();
                }
            }
        }
    }


    /**
     * 可以创建多层目录文件
     *
     * @param path
     * @return
     * @throws IOException
     */
    @SneakyThrows
    public static File getFile(String path) {
        File file = new File(path);
        if (file.exists()) {
            return file;
        } else {
            file.getParentFile().mkdirs();
            file.createNewFile();
            return file;
        }
    }

    /**
     * 获得系统盘符
     */
    public static void getCD() {
        FileSystemView view = FileSystemView.getFileSystemView();

        File[] roots = File.listRoots();
        for (int i = 0; i < roots.length; i++) {
            System.out.println("盘符:" + view.getSystemDisplayName(roots[i]));
            System.out.println("总大小:" + formatSize(roots[i].getTotalSpace()));
            System.out.println("剩余大小:" + formatSize(roots[i].getFreeSpace()));
        }
    }

    /**
     * 格式化盘符大小
     *
     * @param size
     * @return
     */
    public static String formatSize(long size) {
        DecimalFormat df = new DecimalFormat("#.00");
        if (size < 1024) {
            return df.format(size) + "B";
        } else if (size < 1024 * 1024) {
            return df.format((double) size / 1024) + "KB";
        } else if (size < 1024 * 1024 * 1024) {
            return df.format((double) size / (1024 * 1024)) + "M";
        } else {
            return df.format((double) size / (1024 * 1024 * 1024)) + "G";
        }
    }
//    全局规则glob
//    使用类似于正则表达式但语法更简单的模式,匹配路径的字符串。
//
//    glob:*.java 匹配以java结尾的文件
//    glob:*.* 匹配包含'.'的文件
//    glob:*.{java,class} 匹配以java或class结尾的文件
//    glob:foo.? 匹配以foo开头且一个字符扩展名的文件
//    glob:/home/*/* 在unix平台上匹配,例如/home/gus/data等
//    glob:/home/** 在unix平台上匹配,例如/home/gus,/home/gus/data
//    glob:c:\\* 在windows平台上匹配,例如c:foo,c:bar,注意字符串转义
//    规则说明
//    * 匹配零个或多个字符与名称组件,不跨越目录
//    ** 匹配零个或多个字符与名称组件,跨越目录(含子目录)
//    ? 匹配一个字符的字符与名称组件
//     转义字符,例如{表示匹配左花括号
//    [] 匹配方括号表达式中的范围,连字符(-)可指定范围。例如[ABC]匹配"A"、"B"和"C";[a-z]匹配从"a"到"z";[abce-g]匹配"a"、"b"、"c"、"e"、"f"、"g";
//    [!...]匹配范围之外的字符与名称组件,例如[!a-c]匹配除"a"、"b"、"c"之外的任意字符
//    {}匹配组中的任意子模式,多个子模式用","分隔,不能嵌套。
//    正则规则regex
//    使用java.util.regex.Pattern支持的正则表达式。

    /**
     * 遍历目录 查找特定文件
     */
    public static List<String> listFiles(String dir, String glob) throws IOException {
        List<String> files = new ArrayList<>();
        PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(glob);

        Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {

            //            Path path = Paths.get("/usr/web/bbf.jar"); //endsWith 必须是路径中一段完整的
//            path.endsWith("bbf.jar");  // true
//            path.endsWith(".jar");     // false
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                if (pathMatcher.matches(file)) {
                    files.add(file.toString());
                    System.out.println(file);
                }
                return FileVisitResult.CONTINUE;
            }
        });
        return files;
    }

    @SneakyThrows
    public static List<String> lines(File file, String charset) {
        return org.apache.commons.io.FileUtils.readLines(file, charset);
    }

    @SneakyThrows
    public static void write(File file, String content, String charset) {
        org.apache.commons.io.FileUtils.write(file, content, charset);
    }

    @SneakyThrows
    public static void combineFiles(List<File> files, File outFile) {

        try (FileOutputStream fos = new FileOutputStream(outFile);
             FileChannel fosChannel = fos.getChannel();) {
            for (File file : files) {
                if (file.length() > 0) {
                    try (FileInputStream fis = new FileInputStream(file);
                         FileChannel fisChannel = fis.getChannel();) {
                        long size = fisChannel.size();
                        // 只能传输2G的数据
                        fisChannel.transferTo(0, size, fosChannel);
                        log.info("合并源:{}文件到:{}最终文件,最终文件合并前大小:{}.合并后大小:{},源文件大小:{}", file, outFile, size, fosChannel.size(), file.length()
                        );
                    }
                }
            }
        }

    }

    @SneakyThrows
    public static void copyFile(File tempFile, File file) {
        org.apache.commons.io.FileUtils.copyFile(tempFile, file);
    }
}

测试:

package com.util.file;

import org.junit.Test;

import java.io.File;

/**
 * @description: 大文件去重测试
 * @author: AlbertXe
 * @create: 2020-10-21 14:06
 */
public class BigFileDistinctUtilTest {

    @Test
    public void test() {
        File file = new File("d:/a.txt");
        BigFileDistinctUtil build = BigFileDistinctUtil.builder().function(t -> t).build();
        build.uniq(file);
    }

}
原文地址:https://www.cnblogs.com/albertXe/p/13859763.html