C++ 大规模数据排序(100G数据 使用 4G 内存 排序)

思路很简单,先分段排序,存储到临时文件中,然后合并.

使用10000个整数来模拟大数据,每次读取100个到内存中.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

enum
{
    enmMaxFileNameLen = 256,
};

void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName);
void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName);
void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp);
void RandomGenArrInt(int32_t arrInt[], int32_t arrSize);
void RandomGenData(int32_t numberCount);
void QSort(int32_t arrInt[], int32_t start, int32_t end);
void BigDataSort(const char *fileName, const int32_t maxNumberInMem);
int32_t Segment(const char *fileName, const int32_t maxNumberInMem);
int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut);
void PrintArrInt(int32_t arr[], int32_t arrSize);

int32_t main()
{
    RandomGenData(10000);
    BigDataSort("data_10000.txt", 100);
    getchar();
    return 0;
}

void RandomGenArrInt(int32_t arrInt[], int32_t arrSize)
{
    for (int32_t i = 0; i < arrSize; ++i)
    {
        arrInt[i] = i + 1;
    }
    for (int32_t i = 1; i <= arrSize; ++i)
    {
        int32_t m = rand() % arrSize;
        int32_t n = rand() % arrSize;
        int32_t tmp = arrInt[m];
        arrInt[m] = arrInt[n];
        arrInt[n] = tmp;
    }
}

void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName)
{
    FILE *fp = NULL;
    fopen_s(&fp, fileName, "w");
    if (!fp)
    {
        printf("open %s failed!
", fileName);
        return;
    }
    for (int32_t i = 0; i < arrSize; ++i)
    {
        fprintf_s(fp, "%d,", arrInt[i]);
    }
    fclose(fp);
    printf("save %s 
", fileName);
}

void RandomGenData(int32_t numberCount)
{
    int32_t *arr = new int32_t[numberCount];
    RandomGenArrInt(arr, numberCount);
    char fileName[enmMaxFileNameLen] = { 0 };
    sprintf_s(fileName, enmMaxFileNameLen,"data_%d.txt", numberCount);
    SaveArrToFile(arr, numberCount, fileName);
}

void QSort(int32_t arrInt[], int32_t start, int32_t end)
{
    if (start >= end){ return; }
    int32_t i = start, j = end;
    int32_t tmp = arrInt[i];
    while (i < j)
    {
        while (i < j && tmp < arrInt[j])
        {
            --j;
        }
        arrInt[i] = arrInt[j];
        while (i < j && tmp >= arrInt[i])
        {
            ++i;
        }
        arrInt[j] = arrInt[i];
    }
    arrInt[i] = tmp;
    QSort(arrInt, start, i - 1);
    QSort(arrInt, i + 1, end);
}

void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName)
{
    arrSize = 0;
    FILE *fp = NULL;
    fopen_s(&fp, fileName, "r");
    if (!fp)
    {
        printf("open %s failed!
", fileName);
        return;
    }
    while (arrSize < amxArrSize && !feof(fp))
    {
        fscanf_s(fp, "%d,", &arrInt[arrSize++]);
    }
}

void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp)
{
    arrSize = 0;
    while (arrSize < amxArrSize && !feof(fp))
    {
        fscanf_s(fp, "%d,", &arrInt[arrSize]);
        if (!feof(fp))
        {
            ++arrSize;
        }
    }
}

void BigDataSort(const char *fileName, const int32_t maxNumberInMem)
{
    int32_t segFileCount = Segment(fileName, maxNumberInMem);
    int32_t fileIndex = 1;
    char fileName1[enmMaxFileNameLen] = { 0 };
    char fileName2[enmMaxFileNameLen] = { 0 };
    char fileOut[enmMaxFileNameLen] = { 0 };
    while (true)
    {
        sprintf_s(fileName1, "%d.txt", fileIndex++);
        sprintf_s(fileName2, "%d.txt", fileIndex++);
        sprintf_s(fileOut, "%d.txt", segFileCount++);
        int32_t ret = MergeTwoFile(fileName1, fileName2, fileOut);
        if (ret != 0)
        {
            break;
        }
    }
}

int32_t Segment(const char *fileName, const int32_t maxNumberInMem)
{
    int32_t *arr = new int32_t[maxNumberInMem];
    FILE *fp = NULL;
    fopen_s(&fp, fileName, "r");
    if (!fp)
    {
        printf("open %s failed!
", fileName);
        return 0;
    }
    int32_t tmpFileIndex = 1;
    while (true)
    {
        int32_t arrSize = 0;
        ReadArrFromFilePtr(arr, arrSize, maxNumberInMem, fp);
        if (arrSize == 0)
        {
            break;
        }
        QSort(arr, 0, arrSize - 1);
        char tmpFileName[enmMaxFileNameLen] = { 0 };
        sprintf_s(tmpFileName, enmMaxFileNameLen, "%d.txt", tmpFileIndex++);
        SaveArrToFile(arr, arrSize, tmpFileName);
    }
    fclose(fp);
    delete[] arr;
    return tmpFileIndex;
}

int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut)
{
    int32_t ret = 1;
    FILE *fp1 = NULL, *fp2 = NULL, *fpOut = NULL;
    fopen_s(&fp1, fileName1, "r");
    fopen_s(&fp2, fileName2, "r");
    fopen_s(&fpOut, fileOut, "w");
    if (!fileOut)
    {
        printf("open %s failed!
", fileOut);
        return ret;
    }
    int32_t val1 = 0, val2 = 0;
    if (fp1){ fscanf_s(fp1, "%d,", &val1); }
    if (fp2){ fscanf_s(fp2, "%d,", &val2); }
    while (fp1 && fp2 && !feof(fp1) && !feof(fp2))
    {
        if (val1 < val2)
        {
            // printf("%d ", val1);
            fprintf_s(fpOut, "%d,", val1);
            fscanf_s(fp1, "%d,", &val1);
        }
        else
        {
            // printf("%d ", val2);
            fprintf_s(fpOut, "%d,", val2);
            fscanf_s(fp2, "%d,", &val2);
        }
        ret = 0;
    }
    while (fp1 && !feof(fp1))
    {
        // printf("%d ", val1);
        fprintf_s(fpOut, "%d,", val1);
        fscanf_s(fp1, "%d,", &val1);
    }
    while (fp2 && !feof(fp2))
    {
        // printf("%d ", val2);
        fprintf_s(fpOut, "%d,", val2);
        fscanf_s(fp2, "%d,", &val2);
    }
    if (fp1){ fclose(fp1); }
    if (fp2){ fclose(fp2); }
    fclose(fpOut);
    printf("save %s 
", fileOut);
    return ret;
}

void PrintArrInt(int32_t arr[], int32_t arrSize)
{
    for (int32_t i = 0; i < arrSize; ++i)
    {
        printf("%d ", arr[i]);
    }
}
原文地址:https://www.cnblogs.com/tangxin-blog/p/6814421.html