如何加快在Java中的外部归并排序Java

2023-09-11 23:18:08 作者:静待花开

我写code外部合并排序。这个想法是,输入文件包含太多的数字存储在一个数组,所以你读一些,并把它变成文件存储。这是我的code。虽然它运行快,不够快。我在想,如果你能想到的任何改进,我可以在code。需要注意的是,在第一次,我有点每1米整数在一起,所以我跳过合并算法的迭代。

 进口java.io.BufferedInputStream中;
进口java.io.BufferedOutputStream;
进口java.io.DataInputStream中;
进口java.io.DataOutputStream中;
进口java.io.FileInputStream中;
进口java.io.FileNotFoundException;
进口java.io.IOException异常;
进口java.io.RandomAccessFile中;
进口java.security.DigestInputStream中;
进口java.security.MessageDigest中;
进口java.security.NoSuchAlgorithmException;
进口java.util.Arrays中;

公共类ExternalSort {

    公共静态无效排序(F1字符串,字符串F2)抛出异常{
        RandomAccessFile的RAF1 =新的RandomAccessFile(F1,RW);
        RandomAccessFile的raf2和=新的RandomAccessFile(F2,RW);
        INT fileByteSize =(int)的(raf1.length()/ 4);
        INT大小= Math.min(1000000,fileByteSize);
        externalSort(F1,F2,大小);
        布尔writeToOriginal =真;
        DataOutputStream类之处;
        而(大小与LT = fileByteSize){
            如果(writeToOriginal){
                raf1.seek(0);
                DOS =新DataOutputStream类(新的BufferedOutputStream(
                        新MyFileOutputStream(raf1.getFD())));
            } 其他 {
                raf2.seek(0);
                DOS =新DataOutputStream类(新的BufferedOutputStream(
                        新MyFileOutputStream(raf2.getFD())));
            }
            的for(int i = 0; I< fileByteSize;我+ = 2 *大小){
                如果(writeToOriginal){
                    DOS =合并(F2,DOS,我,尺寸);
                } 其他 {
                    DOS =合并(F1,DOS,我,尺寸);
                }
            }
            dos.flush();
            writeToOriginal = writeToOriginal!;
            尺寸* = 2;
        }
        如果(writeToOriginal)
        {
            raf1.seek(0);
            raf2.seek(0);
            DOS =新DataOutputStream类(新的BufferedOutputStream(
                    新MyFileOutputStream(raf1.getFD())));
            INT I = 0;
            而(ⅰ&其中; raf2.length()/ 4){
                dos.writeInt(raf2.readInt());
                我++;
            }
            dos.flush();
        }
    }

    公共静态无效externalSort(字符串F1,F2的字符串,诠释大小)抛出异常{

        RandomAccessFile的RAF1 =新的RandomAccessFile(F1,RW);
        RandomAccessFile的raf2和=新的RandomAccessFile(F2,RW);

        INT fileByteSize =(int)的(raf1.length()/ 4);

        INT []数组=新INT [尺寸]
        的DataInputStream解散=新的DataInputStream(新的BufferedInputStream(
                新MyFileInputStream(raf1.getFD())));
        DataOutputStream类DOS =新DataOutputStream类(新的BufferedOutputStream(
                新MyFileOutputStream(raf2.getFD())));

        诠释计数= 0;
        而(计数< fileByteSize){
            对于(INT K = 0; K<大小; ++ K){
                阵[K] = dis.readInt();
            }
            数+ =大小;
            Arrays.sort(阵列);
            对于(INT K = 0; K<大小; ++ K){
                dos.writeInt(阵[K]);
            }
        }
        dos.flush();
        raf1.close();
        raf2.close();
        透露();
        dos.close();
    }

    公共静态DataOutputStream类合并(字符串的文件,
            DataOutputStream类之处,诠释开始,诠释大小)抛出IOException异常{
        RandomAccessFile的英国皇家空军=新RandomAccessFile的(文件,RW);
        RandomAccessFile的raf2和=新RandomAccessFile的(文件,RW);

        INT fileByteSize =(int)的(raf.length()/ 4);
        raf.seek(4 *启动);
        raf2.seek(4 *启动);
        的DataInputStream解散=新的DataInputStream(新的BufferedInputStream(
                新MyFileInputStream(raf.getFD())));
        的DataInputStream DIS3 =新的DataInputStream(新的BufferedInputStream(
                新MyFileInputStream(raf2.getFD())));
        INT I = 0;
        INT J = 0;
        INT最大=尺寸* 2;

        INT一个= dis.readInt();

        INT B:
        如果(启动+尺寸与LT; fileByteSize){
            dis3.skip(4 *大小);
            B = dis3.readInt();
        } 其他 {
            B = Integer.MAX_VALUE的;
            J =大小;
        }
        而(I + J<最大){
            如果(j == ||尺寸(A< = B和;&安培;!我=大小)){
                dos.writeInt(一);
                我++;
                如果(启动+ I == fileByteSize){
                    我=大小;
                }否则,如果(我!=大小){
                    一个= dis.readInt();
                }
            } 其他 {
                dos.writeInt(B);
                J ++;
                如果(启动+规模+ J == fileByteSize){
                    J =大小;
                }否则,如果(j!=大小){

                    B = dis3.readInt();
                }
            }
        }
        raf.close();
        raf2.close();
        返回到之处;
    }

    公共静态无效的主要(字串[] args)抛出异常{
        字符串F1 =的args [0];
        串f2的=的args [1];

        排序(F1,F2);

     }
}
 

解决方案

您可能希望在同一时间合并K> 2段。这减少了I / O的量的n log K /日志2到n日志N / 1o9氏。

编辑:在伪code,这将是这个样子:

 无效排序(名单列表){
    如果(表装入内存){
        list.sort();
    } 其他 {
        子列表=分区列表为k约一样大的子列表
        对于(子列表:子列表){
            排序(子表);
        }
        合并(子列表);
    }
}

无效合并(名单[] sortedsublists){
    保持一个指针,在每个子列表,它最初指向它的第一个元素
    做 {
        找到指针指着最小的元素
        加它指向到结果列表中的元件
        推进这一指针
    },直到所有的指针已达到其子列表的末尾
    返回结果列表
}
 
Java开发也需要懂算法吗

要有效地找到了最小的指针,你可能会采用的PriorityQueue

I am writing code for the external merge sort. The idea is that the input files contain too many numbers to be stored in an array so you read some of it and put it into files to be stored. Here's my code. While it runs fast, it is not fast enough. I was wondering if you can think of any improvements I can make on the code. Note that at first, I sort every 1m integers together so I skip iterations of the merging algorithm.

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;

public class ExternalSort {

    public static void sort(String f1, String f2) throws Exception {
        RandomAccessFile raf1 = new RandomAccessFile(f1, "rw");
        RandomAccessFile raf2 = new RandomAccessFile(f2, "rw");
        int fileByteSize = (int) (raf1.length() / 4);
        int size = Math.min(1000000, fileByteSize);
        externalSort(f1, f2, size);  
        boolean writeToOriginal = true;
        DataOutputStream dos;
        while (size <= fileByteSize) {
            if (writeToOriginal) {
                raf1.seek(0);
                dos = new DataOutputStream(new BufferedOutputStream(
                        new MyFileOutputStream(raf1.getFD())));
            } else {
                raf2.seek(0);
                dos = new DataOutputStream(new BufferedOutputStream(
                        new MyFileOutputStream(raf2.getFD())));
            }
            for (int i = 0; i < fileByteSize; i += 2 * size) {
                if (writeToOriginal) {
                    dos = merge(f2, dos, i, size);
                } else {
                    dos = merge(f1, dos, i, size);
                }
            }
            dos.flush();
            writeToOriginal = !writeToOriginal;
            size *= 2;
        }
        if (writeToOriginal)
        {
            raf1.seek(0);
            raf2.seek(0);
            dos = new DataOutputStream(new BufferedOutputStream(
                    new MyFileOutputStream(raf1.getFD())));
            int i = 0;
            while (i < raf2.length() / 4){
                dos.writeInt(raf2.readInt());
                i++;
            }   
            dos.flush();
        }
    }

    public static void externalSort(String f1, String f2, int size) throws Exception{

        RandomAccessFile raf1 = new RandomAccessFile(f1, "rw");
        RandomAccessFile raf2 = new RandomAccessFile(f2, "rw");

        int fileByteSize = (int) (raf1.length() / 4);

        int[] array = new int[size];
        DataInputStream dis = new DataInputStream(new BufferedInputStream(
                new MyFileInputStream(raf1.getFD())));
        DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(
                new MyFileOutputStream(raf2.getFD())));

        int count = 0;
        while (count < fileByteSize){
            for (int k = 0; k < size; ++k){
                array[k] = dis.readInt();
            }
            count += size;
            Arrays.sort(array);
            for (int k = 0; k < size; ++k){
                dos.writeInt(array[k]);
            }       
        }
        dos.flush();
        raf1.close();
        raf2.close();
        dis.close();
        dos.close();
    }

    public static DataOutputStream merge(String file,
            DataOutputStream dos, int start, int size) throws IOException {
        RandomAccessFile raf = new RandomAccessFile(file, "rw");
        RandomAccessFile raf2 = new RandomAccessFile(file, "rw");

        int fileByteSize = (int) (raf.length() / 4);
        raf.seek(4 * start);
        raf2.seek(4 *start);
        DataInputStream dis = new DataInputStream(new BufferedInputStream(
                new MyFileInputStream(raf.getFD())));
        DataInputStream dis3 = new DataInputStream(new BufferedInputStream(
                new MyFileInputStream(raf2.getFD())));
        int i = 0;
        int j = 0;
        int max = size * 2;

        int a = dis.readInt();

        int b;
        if (start + size < fileByteSize) {
            dis3.skip(4 * size);
            b = dis3.readInt();
        } else {
            b = Integer.MAX_VALUE;
            j = size;
        }
        while (i + j < max) {
            if (j == size || (a <= b && i != size)) {
                dos.writeInt(a);
                i++;
                if (start + i == fileByteSize) {
                    i = size;
                } else if (i != size) {
                    a = dis.readInt();
                }
            } else {
                dos.writeInt(b);
                j++;
                if (start + size + j == fileByteSize) {
                    j = size;
                } else if (j != size) { 

                    b = dis3.readInt();
                }
            }
        }
        raf.close();
        raf2.close();
        return dos;
    }

    public static void main(String[] args) throws Exception {
        String f1 = args[0];
        String f2 = args[1];

        sort(f1, f2);

     }
}

解决方案

You might wish to merge k>2 segments at a time. This reduces the amount of I/O from n log k / log 2 to n log n / log k.

Edit: In pseudocode, this would look something like this:

void sort(List list) {
    if (list fits in memory) {
        list.sort();
    } else {
        sublists = partition list into k about equally big sublists
        for (sublist : sublists) {
            sort(sublist);
        }
        merge(sublists);
    }
}

void merge(List[] sortedsublists) {
    keep a pointer in each sublist, which initially points to its first element
    do {
        find the pointer pointing at the smallest element
        add the element it points to to the result list
        advance that pointer
    } until all pointers have reached the end of their sublist
    return the result list
}

To efficiently find the "smallest" pointer, you might employ a PriorityQueue.