Top n items in the list (including duplicates)

Trying to find an effective way to get the top N items in a very large list, possibly containing duplicates.

First I tried sorting and slicing, which works. But that seems unnecessary. You don’t need to sort a very large list if you just need the top 20 members. So I wrote a recursive procedure that builds a top-n list. This also works, but much slower than non-recursive!

Question: What is my second subtask (elite2) so slower than the elite, and how do I make it faster? My code is below. Thank.

import scala.collection.SeqView
import scala.math.min
object X {

    def  elite(s: SeqView[Int, List[Int]], k:Int):List[Int] = {
        s.sorted.reverse.force.slice(0,min(k,s.size))
    }

    def elite2(s: SeqView[Int, List[Int]], k:Int, s2:List[Int]=Nil):List[Int] = {
        if( k == 0 || s.size == 0) s2.reverse
        else {
            val m = s.max
            val parts = s.force.partition(_==m)
            val whole = if( parts._1.size > 1) parts._1.tail:::parts._2 else parts._2
            elite2( whole.view, k-1, m::s2 )
        }
    }

    def main(args:Array[String]) = {
        val N = 1000000/3
        val x = List(N to 1 by -1).flatten.map(x=>List(x,x,x)).flatten.view
        println(elite2(x,20))
        println(elite(x,20))
    }
}
+3
source share
5 answers

- , 20 , ? 20 , , 20, . :

  def topNs(xs: TraversableOnce[Int], n: Int) = {
    var ss = List[Int]()
    var min = Int.MaxValue
    var len = 0
    xs foreach { e =>
      if (len < n || e > min) {
        ss = (e :: ss).sorted
        min = ss.head
        len += 1
      }
      if (len > n) {
        ss = ss.tail
        min = ss.head
        len -= 1
      }                    
    }
    ss
  }  

(, SortedSet, , .)

100- , 40 . elite 850 , elite2 4100 . , 20% , .

+3

QuickSelect. QuickSort, , , O (n).

+4

, log(M), M. , , log(M) 30. , . , ( ), , () ( takeRight)

val arr = s.toArray
java.util.Arrays.sort(arr)
arr.takeRight(N).toList

, , . quicksort, , quicksort (, , O(n^2)!). N (), O(log N) , O(N/4) - , N . (, -, ), .

, .

( , , , , s.map(x => /* convert element to corresponding number*/).toArray, , , , , , , .)

+4

, :

selectLargest(n: Int, xs: List): List
  if size(xs) <= n
     return xs
  pivot <- selectPivot(xs)
  (lt, gt) <- partition(xs, pivot)
  if size(gt) == n
     return gt
  if size(gt) < n
     return append(gt, selectLargest(n - size(gt), lt))
  if size(gt) > n
     return selectLargest(n, gt)

selectPivot "" . partition : lt (, , ) gt ( , ). , , , . , , .

Scala.

0

, , . , , , ? :

    import util.Sorting.quickSort

    class TopNSet[T](n:Int) (implicit ev: Ordering[T], ev2: ClassManifest[T]){
      val ss = new Array[T](n)
      var len = 0

      def tryElement(el:T) = {
        if(len < n-1){
          ss(len) = el
          len += 1
        }
         else if(len == n-1){
          ss(len) = el
          len = n
          quickSort(ss)
        }
        else if(ev.gt(el, ss(0))){
          ss(0) = el
          quickSort(ss)
        }
      }
      def getTop() = {
        ss.slice(0,len)
      }
    }

Rating compared to accepted answer:

val myInts = Array.fill(100000000)(util.Random.nextInt)
time(topNs(myInts,100)
//Elapsed time 3006.05485 msecs
val myTopSet = new TopNSet[In](100)
time(myInts.foreach(myTopSet.tryElement(_)))
//Elapsed time 4334.888546 msecs

So, not much slower and certainly much more flexible

0
source

All Articles