Now we will go through this file and mark runs. A run is a sorted subsequence of record.
1. Read 2 records
2. Sort those 2 records
3. Place them in F0
4. Mark eor (end of run)
5. Fetch Next two record and go to step 2
6. If no more records then stop
Now the file F0 will be
53, 76 | 5, 15 | 20, 29 | 31, 37| 40, 41| 49, 50| 61, 91| 76, 81| 1, 2| 3, 4| 5, 6|EOF
But we have not taken advantage of the sortedness of the file hence this primitive method of marking run is not useful.
2. Marking Natural Runs
1. Reset(S)
2. Rewrite(F0)
3. Read record from S s.current
4. Place in F0
5. Read next record from S as s.next
6. If no record then go to step 8
7. If s.current<s.next Then
Place s.next in F0
Go to step 3
Else
Mark eor in F0
Place s.next in F0
Go to step 3
End if
8. Mark eor in F0
9. Mark eof in F0
10.Stop
Now the file F0 will be
76 | 53 | 5, 15, 20, 29, 31, 37, 41 | 40, 50 | 49, 91 | 61, 81 | 76 | 1, 2, 3, 4, 5, 6 | EOF
<A procedure in C for marking natural runs>
markRuns()
{
int runCount=0;
int current, next;
if( (d = fopen("Source.txt", "r")) == NULL)
{
printf("Source file not found.");
exit(1);
}
f1 = fopen("File0.txt","w");
fscanf(d,"%d",¤t);
while(1)
{
fscanf(d, "%d", &next);
if(next==eof)
{
fprintf(f1, "%d %d %d",current,eor,eof);
runCount++;
break;
}
else
{
if(current>next)
{
fprintf(f1, "%d %d ", current, eor);
runCount++;
}
else
{
fprintf(f1, "%d ", current);
}
}
current = next;
}
fclose(d);
fclose(f1);
fclose(f2);
return runCount;
}
As the data is naturally sorted, hence these are called
“Natural Runs”. The runs will be
long, fewer and variable in length. This will have fewer passes. A pass is a run through entire data from 1st till last record.
Distributing Runs in two files F1 and F2
This is a very simple method. A flag is uses to switch between files F1 and F2.
1. Set flag=true
2. Reset(F0)
3. Rewrite(F1)
4. Rewrite(F2)
5. Read an element from F0
6. If element is EOF go to step 8
7. If flag is true
Place the element in F1
If the element is eor
flag=false
Else
Place the element in F2
If the element is eor
flag=true
End if
8. Mark EOF in F1 and F2
9. Stop
Now the contents of the files F1 and F2 will be as follows.
Mark and Distribute as one procedure
We have passed through the source file twice, 1st time to mark runs and 2nd time to distribute runs. These were explicit mark & distribute operations using two separate sub routines. This can be combined into one subroutine Mark & Distribute. The subroutine for primitive run marking and distribution is as follows. Flag is used to switch between files F1 and F2.
1. Reset(Source)
2. Rewrite(F1)
3. Rewrite(F2)
4. do
Read frame full of data
Sort internally using any array sorting algorithm
If Flag=true
Copy in F1
Flag = flase
Else
Copy in F2
Flag = true
End if
Mark eor
Till you reach end of source
5. Stop
Algorithm of subroutine for marking and distributing natural runs is as follows. When we mark end of run, we switch the file and place element. This means that two things will be done at the same time. 1. Marking of runs and 2. Distribution of runs.
1. Reset(Source)
2. Rewrite(F1)
3. Rewrite(F2)
4. Flag=true
5. s.current = Read(source)
6. While (true)
If(flag==true)
f=F1;
Else
f=F2
End if
s.next = Read(source)
If s.next==eof
Write(s.current,f)
Mark eor in f
break
Else
If(s.current>s.next)
Write(s.current, f)
Mark(eor, f)
flag=!(flag)
Else
Write(s.current, f)
End if
End if
s.current=s.next
End While
7. Write(F1, eof)
8. Write(F2, eof)
9. Stop
<A sub procedure in C for marking and distributing natural runs>
markRunsAndDistribute()
{
int runCount=0;
int current, next;
int switchFlag=1;
if( (d = fopen("File0.txt", "r")) == NULL)
{
printf("Source file not found.");
exit(1);
}
f1 = fopen("File1.txt","w");
f2 = fopen("File2.txt", "w");
fscanf(d,"%d",¤t);
while(1)
{
fscanf(d, "%d", &next);
if(next==eof)
{
if(switchFlag==1)
{ fprintf(f1, "%d %d %d",current,eor,eof);
fprintf(f2, "%d",eof);
runCount++;
break;
}
else
{
fprintf(f2, "%d %d %d",current,eor,eof);
fprintf(f1, "%d",eof);
runCount++;
break;
}
}
else
{
if(current>next)
{
if(switchFlag==1)
{
fprintf(f1, "%d %d ", current, eor);
runCount++;
switchFlag=2;
}
else
{
fprintf(f2, "%d %d ", current, eor);
runCount++;
switchFlag=1;
}
}
else
{
if(switchFlag==1)
{
fprintf(f1, "%d ", current);
}
else
{
fprintf(f2, "%d ", current);
}
}
}
current = next;
}
fclose(d);
fclose(f1);
fclose(f2);
return runCount;
}
Merging Runs
After marking runs in the raw data and distributing it into two files, the next step is to merge them and put them back into a file. We pick an element from F1 and F2 each, place the smaller in F0 and advance in the file of which element is placed. If eor is encountered in a file, then copy the remaining elements from the run of the other file. E.g we are having files F1 and F2 as follows.
F1: 76 | 5, 15, 20, 29, 31, 37, 41 | 49, 91 | 76 | EOF
F2: 53 | 40, 50 | 61, 81 | 1, 2, 3, 4, 5, 6 | EOF
Compare 76 and 53; 53<76;
write 53 in F0;
advance in F2;
eor encountered;
Copy 56 in F0;
Mark eor;
Take next elements 5 and 40;
5<40;
write 5 in F0;
advance in F1;
15<40;
write 15 in F0;
and so on till eof in any of the file is encountered. After that copy the remaining runs from the other file as it is in F0.
Finally F0 will be having following data.
F0: 53, 76 | 5, 15, 20, 29, 31, 37, 40, 41, 50 | 49, 61, 81, 91 | 1, 2, 3, 4, 5, 6, 76 | EOF
The algorithm for merging runs from files F1 and F2 and placing them in F0 is as follows.
1. runCount=0;
2. Rewrite(F0)
3. Reset(F1)
4. Reset(F2)
5. Read(e1, F1)
6. Read(e2, F2)
7. While (e1 != eof and e2 != eof)
While (e1 != eor and e2 != eor)
If (e1<e2)
Write(e1, F0)
Read(e1, F1)
Else
Write(e2, F0)
Read(e2, F2)
End if
End While
If (e1==eor)
Copy remaining elements of the run from F2 in F0
Else
Copy remaining elements of the run from F1 in F0
End if
Write(eor, F0)
Runcount++
Read(e1, F1)
Read(e2, F2)
8. End While
9. If(e1==eof)
While (e2 != eof)
Write(e2,F0)
If(e2==eor) runCount++;
Read(e2,F2)
End While
Else if(e2==eof)
While (e1 != eof)
Write(e1,F0)
If(e1==eor) runCount++;
Read(e1,F1)
End While
End if
10. Mark EOF in S0
11. Return runCount
<A sub procedure in C for merging runs>
mergeRuns()
{
int e1,e2;
int runCount=0;
f1 = fopen("file1.txt","r");
f2 = fopen("file2.txt","r");
d = fopen("File0.txt","w");
fscanf(f1,"%d",&e1);
fscanf(f2,"%d",&e2);
while (e1!=eof && e2!=eof)
{
while(e1!=eor && e2!=eor)
{
if(e1<e2)
{
fprintf(d,"%d ",e1);
fscanf(f1,"%d",&e1);
}
else
{
fprintf(d,"%d ",e2);
fscanf(f2,"%d",&e2);
}
}
if(e1==eor)
{
while (e2!=eor)
{
fprintf(d, "%d ", e2);
fscanf(f2, "%d", &e2);
}
}
else if(e2==eor)
{
while (e1!=eor)
{
fprintf(d, "%d ", e1);
fscanf(f1, "%d", &e1);
}
}
fprintf(d, "%d ", eor);
runCount++;
fscanf(f1,"%d",&e1);
fscanf(f2,"%d",&e2);
}
if(e1==eof)
{
while (e2!=eof)
{
fprintf(d, "%d ", e2);
if(e2==eor) runCount++;
fscanf(f2, "%d", &e2);
}
}
else if(e2==eof)
{
while (e1!=eof)
{
fprintf(d, "%d ", e1);
if(e1==eor) runCount++;
fscanf(f1, "%d", &e1);
}
}
fprintf(d,"%d",eof);
fclose(f1);
fclose(f2);
fclose(d);
return runCount;
}
This process of merge and distribute continues till the number of runs in file F0 become 1. At this time the data is completely sorted.
I
f we start with two files each having runs = N; then
after 1 merge and distribute each file will have N/2 runs.
After another merge and distribute each file will have N/4 runs.
This continues till the run count becomes 1.
Hence for N runs the number of passes,
P = log2N
Similarly if we know the passes P then number of runs N will be
2P-1 < N <= 2P.
Finally, a complete program in C to demonstrate file sorting using two source and one destination.
#include<stdio.h>
#include<conio.h>
#include<stdlib.h>
#define eor -1
#define eof -2
FILE *f1, *f2, *d;
markRuns();
mergeRuns();
void distributeRuns();
void main()
{
markRuns();
distributeRuns();
while(mergeRuns()!=1)
{
distributeRuns();
}
printf("File Sorted....");
getch();
}
markRuns()
{
int runCount=0;
int current, next;
if( (d = fopen("Source.txt", "r")) == NULL)
{
printf("Source file not found.");
exit(1);
}
f1 = fopen("File0.txt","w");
fscanf(d,"%d",¤t);
while(1)
{
fscanf(d, "%d", &next);
if(next==eof)
{
fprintf(f1, "%d %d %d",current,eor,eof);
runCount++;
break;
}
else
{
if(current>next)
{
fprintf(f1, "%d %d ", current, eor);
runCount++;
}
else
{
fprintf(f1, "%d ", current);
}
}
current = next;
}
fclose(d);
fclose(f1);
fclose(f2);
return runCount;
}
void distributeRuns()
{
int switchFlag=1;
int e;
f1 = fopen("File1.txt", "w");
f2 = fopen("File2.txt", "w");
d = fopen("File0.txt", "r");
fscanf(d,"%d",&e);
while (e!=eof)
{
if(switchFlag==1)
{
fprintf(f1, "%d ", e);
if(e==eor) switchFlag=2;
}
else
{
fprintf(f2, "%d ", e);
if(e==eor) switchFlag=1;
}
fscanf(d,"%d",&e);
}
fprintf(f1, "%d",eof);
fprintf(f2, "%d",eof);
fclose(d);
fclose(f1);
fclose(f2);
}
mergeRuns()
{
int e1,e2;
int runCount=0;
f1 = fopen("file1.txt","r");
f2 = fopen("file2.txt","r");
d = fopen("File0.txt","w");
fscanf(f1,"%d",&e1);
fscanf(f2,"%d",&e2);
while (e1!=eof && e2!=eof)
{
while(e1!=eor && e2!=eor)
{
if(e1<e2)
{
fprintf(d,"%d ",e1);
fscanf(f1,"%d",&e1);
}
else
{
fprintf(d,"%d ",e2);
fscanf(f2,"%d",&e2);
}
}
if(e1==eor)
{
while (e2!=eor)
{
fprintf(d, "%d ", e2);
fscanf(f2, "%d", &e2);
}
}
else if(e2==eor)
{
while (e1!=eor)
{
fprintf(d, "%d ", e1);
fscanf(f1, "%d", &e1);
}
}
fprintf(d, "%d ", eor);
runCount++;
fscanf(f1,"%d",&e1);
fscanf(f2,"%d",&e2);
}
if(e1==eof)
{
while (e2!=eof)
{
fprintf(d, "%d ", e2);
if(e2==eor) runCount++;
fscanf(f2, "%d", &e2);
}
}
else if(e2==eof)
{
while (e1!=eof)
{
fprintf(d, "%d ", e1);
if(e1==eor) runCount++;
fscanf(f1, "%d", &e1);
}
}
fprintf(d,"%d",eof);
fclose(f1);
fclose(f2);
fclose(d);
return runCount;
}