How to perform binary search for a text file

I have a large text file (5 MB) that I use in my Android application. I create the file as a list of pre-sorted lines, and the file does not change after it is created. How can I do a binary search in the contents of this file without reading line by line to find a suitable line?

+5
source share
5 answers

Since the contents of the file do not change, you can split the file into several parts. Say AG, HN, 0-T and UZ. This allows you to check the first character and immediately cut the possible set into a fourth of the original size. Now a linear search will not take so much time, or reading the entire file may be an option. This process can be expanded if n / 4 is still too large, but the idea is the same. Create search breakdowns in the file structure, rather than trying to do all of this in memory.

+5
source

The 5MB file is not that big - you have to read every line in the array String[], which you can then use java.util.Arrays.binarySearch()to find the line you need. This is my recommended approach.

, . , RandomAccessFile , seek(), ...

// open the file for reading
RandomAccessFile raf = new RandomAccessFile("myfile.txt","r");
String searchValue = "myline";
int lineSize = 50;
int numberOfLines = raf.length() / lineSize;

// perform the binary search...
byte[] lineBuffer = new byte[lineSize];
int bottom = 0;
int top = numberOfLines;
int middle;
while (bottom <= top){
  middle = (bottom+top)/2;
  raf.seek(middle*lineSize); // jump to this line in the file
  raf.read(lineBuffer); // read the line from the file
  String line = new String(lineBuffer); // convert the line to a String

  int comparison = line.compareTo(searchValue);
  if (comparison == 0){
    // found it
    break;
    }
  else if (comparison < 0){
    // line comes before searchValue
    bottom = middle + 1;
    }
  else {
    // line comes after searchValue
    top = middle - 1;
    }
  }

raf.close(); // close the file when you're finished

, , , , , .

+1

, , , , , . android, , , , -, (, , ). , .

( RandomAccessFile) "" ints - - , . .

( ), - . .

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Set;

class StringFileSet {

    private static final double loadFactor = 0.75;

    public static void makeFile(String fileName, String comment, Set<String> set) throws IOException {
        new File(fileName).delete();
        RandomAccessFile fout = new RandomAccessFile(fileName, "rw");

        //Write comment
        fout.writeUTF(comment);

        //Make bucket array
        int numBuckets = (int)(set.size()/loadFactor);

        ArrayList<ArrayList<String>> bucketArray = new ArrayList<ArrayList<String>>(numBuckets);
        for (int ii = 0; ii < numBuckets; ii++){
            bucketArray.add(new ArrayList<String>());
        }

        for (String key : set){
            bucketArray.get(Math.abs(key.hashCode()%numBuckets)).add(key);
        }

        //Sort key lists in preparation for creating trees
        for (ArrayList<String> keyList : bucketArray){
            Collections.sort(keyList);
        }

        //Make queues in preparation for creating trees
        class NodeInfo{

            public final int lower;
            public final int upper;
            public final long callingOffset;

            public NodeInfo(int lower, int upper, long callingOffset){
                this.lower = lower;
                this.upper = upper;
                this.callingOffset = callingOffset;
            }

        }

        ArrayList<LinkedList<NodeInfo>> queueList = new ArrayList<LinkedList<NodeInfo>>(numBuckets);
        for (int ii = 0; ii < numBuckets; ii++){
            queueList.add(new LinkedList<NodeInfo>());
        }

        //Write bucket array
        fout.writeInt(numBuckets);
        for (int index = 0; index < numBuckets; index++){
            queueList.get(index).add(new NodeInfo(0, bucketArray.get(index).size()-1, fout.getFilePointer()));
            fout.writeInt(-1);
        }

        //Write trees
        for (int bucketIndex = 0; bucketIndex < numBuckets; bucketIndex++){
            while (queueList.get(bucketIndex).size() != 0){
                NodeInfo nodeInfo = queueList.get(bucketIndex).poll();
                if (nodeInfo.lower <= nodeInfo.upper){
                    //Set respective pointer in parent node
                    fout.seek(nodeInfo.callingOffset);
                    fout.writeInt((int)(fout.length() - (nodeInfo.callingOffset + 4))); //Distance instead of absolute position so that the get method can use a DataInputStream
                    fout.seek(fout.length());

                    int middle = (nodeInfo.lower + nodeInfo.upper)/2;

                    //Key
                    fout.writeUTF(bucketArray.get(bucketIndex).get(middle));

                    //Left child
                    queueList.get(bucketIndex).add(new NodeInfo(nodeInfo.lower, middle-1, fout.getFilePointer()));
                    fout.writeInt(-1);

                    //Right child
                    queueList.get(bucketIndex).add(new NodeInfo(middle+1, nodeInfo.upper, fout.getFilePointer()));
                    fout.writeInt(-1);
                }
            }
        }

        fout.close();
    }

    private final String fileName;
    private final int numBuckets;
    private final int bucketArrayOffset;

    public StringFileSet(String fileName) throws IOException {
        this.fileName = fileName;

        DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(fileName)));

        short numBytes = fin.readShort();
        fin.skipBytes(numBytes);
        this.numBuckets = fin.readInt();
        this.bucketArrayOffset = numBytes + 6;

        fin.close();
    }

    public boolean contains(String key) throws IOException {
        boolean containsKey = false;

        DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(this.fileName)));

        fin.skipBytes(4*(Math.abs(key.hashCode()%this.numBuckets)) + this.bucketArrayOffset);

        int distance = fin.readInt();
        while (distance != -1){
            fin.skipBytes(distance);

            String candidate = fin.readUTF();
            if (key.compareTo(candidate) < 0){
                distance = fin.readInt();
            }else if (key.compareTo(candidate) > 0){
                fin.skipBytes(4);
                distance = fin.readInt();
            }else{
                fin.skipBytes(8);
                containsKey = true;
                break;
            }
        }

        fin.close();

        return containsKey;
    }

}

import java.io.File;
import java.io.IOException;
import java.util.HashSet;

class Test {
    public static void main(String[] args) throws IOException {
        HashSet<String> stringMemorySet = new HashSet<String>();

        stringMemorySet.add("red");
        stringMemorySet.add("yellow");
        stringMemorySet.add("blue");

        StringFileSet.makeFile("stringSet", "Provided under ... included in all copies and derivatives ...", stringMemorySet);
        StringFileSet stringFileSet = new StringFileSet("stringSet");

        System.out.println("orange -> " + stringFileSet.contains("orange"));
        System.out.println("red -> " + stringFileSet.contains("red"));
        System.out.println("yellow -> " + stringFileSet.contains("yellow"));
        System.out.println("blue -> " + stringFileSet.contains("blue"));

        new File("stringSet").delete();

        System.out.println();
    }
}

, android, getResources().

, , Android , , -, - - - jpg. 100-300 .

, NDK.

+1

-, . : , . : 10 , 22 ( , , aaah 0, 4 ..). big endian ( java). , -.

word.dat:

aaahabasementableabnormalabnormalityabortionistabortion-rightsabracadabra

wordx.dat:

00 80 00 00 01 20 00 04 00 80 00 0D 01 00 00 11   _____ __________
01 60 00 19 01 60 00 24 01 E0 00 2F 01 60 00 3E   _`___`_$___/_`_>

#, ( txt , crlfs)

static void Main(string[] args)
{
    const string fIn = @"C:\projects\droid\WriteFiles\input\allwords.txt";
    const string fwordxOut = @"C:\projects\droid\WriteFiles\output\wordx.dat";
    const string fWordOut = @"C:\projects\droid\WriteFiles\output\word.dat";

    int i = 0;
    int offset = 0;
    int j = 0;
    var lines = File.ReadLines(fIn);

    FileStream stream = new FileStream(fwordxOut, FileMode.Create, FileAccess.ReadWrite);
    using (EndianBinaryWriter wwordxOut = new EndianBinaryWriter(EndianBitConverter.Big, stream))
    {
        using (StreamWriter wWordOut = new StreamWriter(File.Open(fWordOut, FileMode.Create)))
        {
            foreach (var line in lines)
            {
                wWordOut.Write(line);
                i = offset | ((int)line.Length << 22); //first 10 bits to the left is the word size
                offset = offset + (int)line.Length;
                wwordxOut.Write(i);
                //if (j == 7)
                  //  break;
                j++;
            }
        }
    }
}

Java :

public static void binarySearch() {
    String TAG = "TEST";
    String wordFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/word.dat";
    String wordxFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/wordx.dat";

    String target = "abracadabra"; 
    boolean targetFound = false; 
    int searchCount = 0; 

    try {
        RandomAccessFile raf = new RandomAccessFile(wordxFilePath, "r");
        RandomAccessFile rafWord = new RandomAccessFile(wordFilePath, "r");
        long low = 0;
        long high = (raf.length() / 4) - 1;
        int cur = 0;
        long wordOffset = 0;
        int len = 0;

        while (high >= low) {
            long mid = (low + high) / 2;
            raf.seek(mid * 4);
            cur = raf.readInt();
            Log.v(TAG + "-cur", String.valueOf(cur));

            len = cur >> 22; //word length

            cur = cur & 0x3FFFFF;  //first 10 bits are 0

            rafWord.seek(cur);
            byte [] bytes = new byte[len];

            wordOffset = rafWord.read(bytes, 0, len);
            Log.v(TAG + "-wordOffset", String.valueOf(wordOffset));

            searchCount++;

            String str = new String(bytes);

            Log.v(TAG, str);

            if (target.compareTo(str) < 0) {
                high = mid - 1;
            } else if (target.compareTo(str) == 0) {
                targetFound = true;
                break;
            } else {
                low = mid + 1;
            }
        }

        raf.close();
        rafWord.close();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (targetFound == true) {
        Log.v(TAG + "-found " , String.valueOf(searchCount));
    } else {
        Log.v(TAG + "-not found " , String.valueOf(searchCount));
    }

}
0

, , , . . .

0
source