Indexing a CSV data file in Solr – using annotated Java POJOs

1. Java POJO:

Add a Java POJO with the required fields:

import org.apache.solr.client.solrj.beans.Field;

/**
 * Solr document bean describing one product record.
 *
 * <p>Every field carries SolrJ's {@code @Field} annotation so that an instance can be
 * passed to SolrJ's bean-based calls (such as {@code SolrServer.addBean}) instead of
 * building the document by hand. {@code id} uses the annotation without a value, so it
 * is bound under the Java field's own name; the other fields name their Solr field
 * explicitly.
 *
 * Created by yash on 18/11/14.
 */
public class ProductBean {

    @Field
    private int id;

    @Field("rank")
    private int rank;

    @Field("prodid")
    private long prodid;

    @Field("cat")
    private int cat;

    @Field("subcat")
    private int subcat;

    /** No-argument constructor; required by Solr to initialize the bean. */
    public ProductBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param id     value stored in the {@code id} field
     * @param rank   value stored in the {@code rank} field
     * @param prodid value stored in the {@code prodid} field
     * @param cat    value stored in the {@code cat} field
     * @param subcat value stored in the {@code subcat} field
     */
    public ProductBean(int id, int rank, long prodid, int cat, int subcat) {
        this.id = id;
        this.rank = rank;
        this.prodid = prodid;
        this.cat = cat;
        this.subcat = subcat;
    }

    /** @return the value of the {@code id} field */
    public int getId() {
        return id;
    }

    /** @param id new value for the {@code id} field */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the value of the {@code rank} field */
    public int getRank() {
        return rank;
    }

    /** @param rank new value for the {@code rank} field */
    public void setRank(int rank) {
        this.rank = rank;
    }

    /** @return the value of the {@code prodid} field */
    public long getProdid() {
        return prodid;
    }

    /** @param prodid new value for the {@code prodid} field */
    public void setProdid(long prodid) {
        this.prodid = prodid;
    }

    /**
     * Legacy accessor kept so existing callers keep compiling; its name does not follow
     * the JavaBean convention. Prefer {@link #getProdid()}.
     *
     * @return the value of the {@code prodid} field
     */
    public long getprodid() {
        return getProdid();
    }

    /**
     * Legacy mutator kept so existing callers keep compiling; its name does not follow
     * the JavaBean convention. Prefer {@link #setProdid(long)}.
     *
     * @param prodid new value for the {@code prodid} field
     */
    public void setprodid(long prodid) {
        setProdid(prodid);
    }

    /** @return the value of the {@code cat} field */
    public int getCat() {
        return cat;
    }

    /** @param cat new value for the {@code cat} field */
    public void setCat(int cat) {
        this.cat = cat;
    }

    /** @return the value of the {@code subcat} field */
    public int getSubcat() {
        return subcat;
    }

    /** @param subcat new value for the {@code subcat} field */
    public void setSubcat(int subcat) {
        this.subcat = subcat;
    }
}

2. Index the data file in Solr

Use the POJO to index the data file into Solr:

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Scanner;

/**
 * Reads a comma-separated data file line by line and indexes every row into Solr as a
 * {@code ProductBean}, using SolrJ's bean binding.
 *
 * Created by yash on 6/11/14.
 */
@Service
public class SolrIndexerService {
    private static final Logger log = LoggerFactory.getLogger(SolrIndexerService.class);

    private static final String SOLR_URL = "http://54.254.192.149:8983/solr/feeddata/";
    private static final String FILE_PATH = "/home/yash/Desktop/solr-data/testdata.txt";

    /** Number of added records between two intermediate commits. */
    private static final int COMMIT_INTERVAL = 1000;

    /**
     * Indexes the file at {@code FILE_PATH} into the Solr core at {@code SOLR_URL}.
     *
     * <p>Blank lines are skipped. Each remaining line is split on commas; the first four
     * columns become rank, prodid, cat and subcat, and the running record counter is used
     * as the document id. A commit is issued every {@code COMMIT_INTERVAL} records and
     * once more at the end, after which older records are deleted and the index is
     * optimized.
     *
     * @throws IOException              if the Solr URL is malformed, the file cannot be
     *                                  opened, or reading it fails part-way
     * @throws SolrServerException      if Solr rejects an add, commit, delete or optimize
     * @throws NumberFormatException    if a column does not hold a valid number
     * @throws ArrayIndexOutOfBoundsException if a line has fewer than four columns
     */
    public void indexFile() throws IOException, SolrServerException {
        SolrServer server = new CommonsHttpSolrServer(SOLR_URL);
        // Captured before indexing starts so it marks the beginning of this run.
        long currentTstmp = System.currentTimeMillis();
        int recordCount = 0;

        Scanner sc = new Scanner(new File(FILE_PATH));
        try {
            while (sc.hasNextLine()) {
                String record = sc.nextLine().trim();
                if (record.isEmpty()) {
                    continue;
                }

                log.debug("{}", record);

                server.addBean(parseRecord(recordCount, record));
                recordCount++;
                if (recordCount % COMMIT_INTERVAL == 0) {
                    server.commit();  // periodically flush
                }
            }

            // Scanner hides read failures and simply reports "no more lines". Surface
            // them here so the clean-up below never runs against a half-read file.
            IOException readFailure = sc.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        } finally {
            sc.close();
        }

        server.commit();

        /*
         * Remove all records with updated time less than current updated timestamp.
         * An inclusive range ending at (timestamp - 1) expresses "strictly less than";
         * a negated term query would instead delete every document whose value differs
         * from the timestamp, including documents that do not carry the field at all.
         * NOTE(review): assumes last_updated is a numeric epoch-millis field filled in
         * outside this class (ProductBean does not set it) - confirm against the schema.
         */
        server.deleteByQuery("last_updated:[* TO " + (currentTstmp - 1) + "]");
        server.commit();

        server.optimize();
        log.info("Done !!");
    }

    /**
     * Converts one non-blank line of the data file into a bean.
     *
     * <p>Only the first four columns are consumed, matching ProductBean's five-argument
     * constructor (id plus four data columns); any further columns are ignored.
     * Surrounding whitespace in a column is tolerated.
     *
     * @param id     document id to assign to the bean
     * @param record the comma-separated line
     * @return the populated bean
     */
    private ProductBean parseRecord(int id, String record) {
        String[] columns = record.split(",");
        return new ProductBean(id,
                Integer.parseInt(columns[0].trim()),
                Long.parseLong(columns[1].trim()),
                Integer.parseInt(columns[2].trim()),
                Integer.parseInt(columns[3].trim()));
    }

    /**
     * Command-line entry point: runs a single indexing pass.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        new SolrIndexerService().indexFile();
    }

}

Yash Sharma is a Big Data & Machine Learning Engineer and a newbie open-source contributor who plays guitar and enjoys teaching as a part-time hobby.
Talk to Yash about distributed systems and data platform design.

Leave a Reply

Your email address will not be published. Required fields are marked *