Thursday, 3 May 2012

File Sorting - 2 Source 2 Destination


The problem with 2S - 1D is the explicit run distribution that occurs after each merge. It doubles the pressure on the disk queue. We can reduce this pressure drastically by allocating one more frame and designing the file sorting algorithm for 2 sources and 2 destinations (2S - 2D).

There will be four files F1, F2, F3 and F4. We have marked and distributed runs in F1 and F2. 

First take F1 & F2 as sources S1, S2 and F3 & F4 as destinations D1, D2 respectively. Take a run from S1 and a run from S2. Merge and place in D1 then switch the destination to D2. Take another run from S1 and S2, merge and place in D2. Again switch the destination. If end of file is encountered in S1 and S2 then D1 and D2 will become new sources and S1 and S2 will become new destination files. Hence the data movement will be from source to destination. Which pair of files will be source and which pair will be destination, will keep on changing as per time. 

At the end there will be one run on S1 and one run on S2. Merge and write on D1. As no data to write on D2, hence all the data is sorted.

The algorithm for the sub procedure for merging 2S-2D is as follows.
MergeRuns(boolean flag)
1. runCount=0;
2. if(flag==true)
 s1 = Reset(F1)
 s2 = Reset(F2)
 d1 = Rewrite(F3)
 d2 = Rewrite(F4)
   Else
 s1 = Reset(F3)
 s2 = Reset(F4)
 d1 = Rewrite(F1)
 d2 = Rewrite(F2)
   End if
3. e1 = Read(s1)
4. e2 = Read(s2)
5. d = d1;
6. While ( e1 != eof  and e2 !=eof)
 While (e1 != eor and e2 !=eor)
  If(e1<e2)
   Write(d, e1)
   e1 = Read(s1)
  Else
   Write(d, e2)
   e2 = Read(s2)
  End if
 End While

 If (e1 == eor)
  Copy remaining elements of the run from s2
 Else if (e2 == eor)
  Copy remaining elements of the run from s1
 End if
 Mark eor in d
 runCount++;
 
 If (d==d1)
  d=d2
 Else
  d=d1
 End if
   End While
7. If(e1==eof)
 While (e2 != eof)
  Write(e2,d)
  If(e2==eor) runCount++;
  Read(e2,s2)
 End While
   Else if(e2==eof)
 While (e1 != eof)
  Write(e1,d)
  If(e1==eor) runCount++;
  Read(e1,s1)
 End While
   End if
8. Mark EOF in d1
9. Mark EOF in d2
10.Return runCount


A program in C to sort file using 2S - 2D technique.

#include<stdio.h>
#include<conio.h>
#include<stdlib.h>

#define eor -1
#define eof -2

FILE *f1, *f2, *d1, *d2;

markRunsAndDistribute();
mergeRuns(int);


void main()
{
 int flag=0;
 markRunsAndDistribute();

 while(mergeRuns(flag)!=1)
  flag=1-flag;

 printf("File Sorted....");
}

markRunsAndDistribute()
{
 int runCount=0;
 int current, next;
 int switchFlag=1;


 if( (d1 = fopen("Source.txt", "r")) == NULL)
 {
  printf("Source file not found.");
  exit(1);
 }

 f1 = fopen("File1.txt","w");
 f2 = fopen("File2.txt", "w");

 fscanf(d1,"%d",¤t);

 while(1)
 {
  fscanf(d1, "%d", &next);

  if(next==eof)
  {
   if(switchFlag==1)
   {       fprintf(f1, "%d %d %d",current,eor,eof);
    fprintf(f2, "%d",eof);
    runCount++;
    break;
   }
   else
   {
    fprintf(f2, "%d %d %d",current,eor,eof);
    fprintf(f1, "%d",eof);
    runCount++;
    break;
   }
  }
  else
  {
   if(current>next)
   {
    if(switchFlag==1)
    {
     fprintf(f1, "%d %d ", current, eor);
     runCount++;
     switchFlag=2;
    }
    else
    {
     fprintf(f2, "%d %d ", current, eor);
     runCount++;
     switchFlag=1;
    }
   }
   else
   {
    if(switchFlag==1)
    {
        fprintf(f1, "%d ", current);
    }
    else
    {
        fprintf(f2, "%d ", current);
    }

   }
  }
  current = next;
 }

 fclose(d1);
 fclose(f1);
 fclose(f2);

 return runCount;
}

mergeRuns(int group)
{
 int switchD=0;
 int runCount=0;
 FILE *d;

 int e1, e2;

 if(group==0)
 {
  f1 = fopen("File1.txt","r");
  f2 = fopen("File2.txt","r");
  d1 = fopen("File3.txt","w");
  d2 = fopen("File4.txt","w");
 }
 else
 {
  f1 = fopen("File3.txt","r");
  f2 = fopen("File4.txt","r");
  d1 = fopen("File1.txt","w");
  d2 = fopen("File2.txt","w");
 }

 
 fscanf(f1,"%d",&e1);
 fscanf(f2,"%d",&e2);

 d=d1;

 while (e1!=eof && e2!=eof)
 {
  while(e1!=eor && e2!=eor)
  {
   if(e1<e2)
   {
    fprintf(d,"%d ",e1);
    fscanf(f1,"%d",&e1);
   }
   else
   {
    fprintf(d,"%d ",e2);
    fscanf(f2,"%d",&e2);
   }
  }

  if(e1==eor)
  {
   while (e2!=eor)
   {
    fprintf(d, "%d ", e2);
    fscanf(f2, "%d", &e2);
   }
  }
  else if(e2==eor)
  {
   while (e1!=eor)
   {
    fprintf(d, "%d ", e1);
    fscanf(f1, "%d", &e1);
   }
  }

  fprintf(d, "%d ", eor);
  runCount++;

  if(d==d1)
   d=d2;
  else
   d=d1;

  fscanf(f1,"%d",&e1);
  fscanf(f2,"%d",&e2);
  
 }
 if(e1==eof)
 {
  while (e2!=eof)
  {
   fprintf(d, "%d ", e2);
   if(e2==eor) runCount++;
   fscanf(f2, "%d", &e2);
  }
 }
 else if(e2==eof)
 {
  while (e1!=eof)
  {
   fprintf(d, "%d ", e1);
   if(e1==eor) runCount++;
   fscanf(f1, "%d", &e1);
  }
 }

 fprintf(d1,"%d",eof);
 fprintf(d2,"%d",eof);
 

 fclose(f1);
 fclose(f2);
 fclose(d1);
 fclose(d2);

 return runCount;
}

Advantage of 2S - 2D

Consider a file with 1K record. To fetch (read and write) a record 1 disk access is required and to distribute the record in two files 1K disk access.



Approximately half the number of disk accesses are saved in 2S – 2D. Hence by giving one extra frame the pressure on the disk queue is reduced drastically.

No comments:

Post a Comment

Your comments are very much valuable for us. Thanks for giving your precious time.