How to add line numbers for lines in PIG or HIVE?

I have a problem adding line numbers using Apache Pig. The problem is that I have a STR_ID column and I want to add a ROW_NUM column for the data in STR_ID, which is the row number STR_ID.

For example, here is the input:

STR_ID
------------
3D64B18BC842
BAECEFA8EFB6
346B13E4E240
6D8A9D0249B4
9FD024AA52BA

How to get a conclusion, for example:

   STR_ID    |   ROW_NUM
----------------------------
3D64B18BC842 |     1
BAECEFA8EFB6 |     2
346B13E4E240 |     3
6D8A9D0249B4 |     4
9FD024AA52BA |     5

Answers using Pig or Hive are acceptable. Thank.

+5
source share
8 answers

Facebook sent the number of UDFs with the hive, including NumberRows. Depending on your version of the hive (I think 0.8), you may need to add an attribute to the class (stateful = true).

+3
source

In the hive:

Query

select str_id,row_number() over() from tabledata;

Output

3D64B18BC842      1
BAECEFA8EFB6      2
346B13E4E240      3
6D8A9D0249B4      4
9FD024AA52BA      5
+4
source

0.11 RANK, .

+2

, Pig, ( ) UDF. . :

import java.io.IOException;
import java.util.Iterator;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.data.DataType;

public class RowCounter extends EvalFunc<DataBag> {
TupleFactory mTupleFactory = TupleFactory.getInstance();
BagFactory mBagFactory = BagFactory.getInstance();
public DataBag exec(Tuple input) throws IOException {
    try {
        DataBag output = mBagFactory.newDefaultBag();
        DataBag bg = (DataBag)input.get(0);
        Iterator it = bg.iterator();
        Integer count = new Integer(1);
        while(it.hasNext())
            { Tuple t = (Tuple)it.next();
              t.append(count);
              output.add(t);
              count = count + 1;
            }

        return output;
    } catch (ExecException ee) {
        // error handling goes here
        throw ee;
    }
}
public Schema outputSchema(Schema input) {
     try{
         Schema bagSchema = new Schema();
         bagSchema.add(new Schema.FieldSchema(null, DataType.BAG));

         return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input),
                                                bagSchema, DataType.BAG));
     }catch (Exception e){
        return null;
     }
    }
}

. .

+1

1. row_sequence() ID

add jar /Users/trongtran/research/hadoop/dev/hive-0.9.0-bin/lib/hive-contrib-0.9.0.jar;
drop temporary function row_sequence;
create temporary function row_sequence as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';

2. STR

INSERT OVERWRITE TABLE new_table
SELECT 
    row_sequence(),
    STR_ID
FROM old_table;
+1

-

select *
  ,rank() over (rand()) as row_num
  from table

, , STR_ID -

select *
  ,rank() over (STR_ID,rank()) as row_num
  from table
+1

:

select
str_id, ROW_NUMBER() OVER() as row_num 
from myTable;
+1

All Articles