Quantcast
Channel: Yudiz Solutions Ltd.
Viewing all articles
Browse latest Browse all 595

Pagination Data Scraping in Android using Jsoup(Java HTML Parser)

$
0
0

Overview

This tutorial shows how to select and iterate over all the elements of an HTML document using Jsoup.
Jsoup provides a select() method that accepts CSS-style selectors to choose the elements.
Click here to start with the basics of Data Scraping in Android using JSOUP
Now we will scrape all the data from every paginated page of the Yudiz blog and display it in a RecyclerView.

Steps :-

  1. First of all, we need to find the total number of pages available on the blog page.
  2. We store all the page URLs in an ArrayList.
  3. Then we connect to each URL in turn and extract the data we need from it.

Step 1 : HTML Source Code

We will scrape data from the webpage http://www.yudiz.com/blog/.

pagination-image1

Total Number of pages HTML Code:-

<div class="pages">
<a href="http://www.yudiz.com/blog/" class="page active">1</a>
<a href="http://www.yudiz.com/blog/page/2/" class="page">2</a>
<a href="http://www.yudiz.com/blog/page/3/" class="page">3</a>
<a href="http://www.yudiz.com/blog/page/4/" class="page">4</a>
<a href="http://www.yudiz.com/blog/page/5/" class="page">5</a>
<a href="http://www.yudiz.com/blog/page/6/" class="page">6</a>
<a href="http://www.yudiz.com/blog/page/7/" class="page">7</a>
<a href="http://www.yudiz.com/blog/page/8/" class="page">8</a>
</div>

Author Name HTML Code:-

<span class="vcard author post-author test">
<a href="http://www.yudiz.com/author/sandeep-joshi/">
Sandeep Joshi
</a>
</span>

Blog Upload Date HTML Code:-

<span class="post-date updated">November 24, 2017</span>

Blog Title HTML Code:-

<div class="post-title">
<h2 class="entry-title" itemprop="headline">
<a href="http://www.yudiz.com/how-to-customize-your-app-icon/">
How to customize your app icon?
</a>
</h2>
</div>

Note:- For scraping, you must find an HTML element (tag and class) that uniquely identifies each field you need. If the same element is also used for other purposes on the page, choose a different, more specific HTML element instead.

pagination-image2

pagination-image3

Step 2 : Android Source Code

Permission needed in AndroidManifest.xml :-

<uses-permission android:name="android.permission.INTERNET" />

Gradle dependency to be added :-

dependencies {
   // Jsoup (Java HTML parser) used by MainActivity to fetch and query the blog pages.
   implementation 'org.jsoup:jsoup:1.11.2'
}

activity_main.xml

<?xml version="1.0" encoding="utf-8"?>
<!-- Layout for MainActivity: a full-screen RecyclerView inside a ConstraintLayout. -->
<android.support.constraint.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android"
   xmlns:app="http://schemas.android.com/apk/res-auto"
   xmlns:tools="http://schemas.android.com/tools"
   android:layout_width="match_parent"
   android:layout_height="match_parent"
   tools:context="com.jsoupdemo.MainActivity">

   <!-- List of scraped blog posts; looked up by this id in MainActivity and given a DataAdapter. -->
   <android.support.v7.widget.RecyclerView
       android:id="@+id/act_recyclerview"
       android:layout_width="match_parent"
       android:layout_height="match_parent">

   </android.support.v7.widget.RecyclerView>

</android.support.constraint.ConstraintLayout>

row_data.xml

<?xml version="1.0" encoding="utf-8"?>
<!-- One row of the blog list: a card with title, author and upload date stacked vertically.
     Inflated by DataAdapter.onCreateViewHolder. -->
<android.support.v7.widget.CardView xmlns:android="http://schemas.android.com/apk/res/android"
   android:layout_width="match_parent"
   android:layout_height="wrap_content"
   android:layout_margin="5dp">

   <LinearLayout
       android:layout_width="match_parent"
       android:layout_height="wrap_content"
       android:orientation="vertical">

       <!-- Blog title (bold); bound to the title list in DataAdapter. -->
       <TextView
           android:id="@+id/row_tv_blog_title"
           android:layout_width="match_parent"
           android:layout_height="wrap_content"
           android:layout_margin="5dp"
           android:textStyle="bold" />

       <!-- Author name; bound to the author list in DataAdapter. -->
       <TextView
           android:id="@+id/row_tv_blog_author"
           android:layout_width="match_parent"
           android:layout_height="wrap_content"
           android:layout_margin="5dp" />

       <!-- Upload date text; bound to the upload-date list in DataAdapter. -->
       <TextView
           android:id="@+id/row_tv_blog_upload_date"
           android:layout_width="match_parent"
           android:layout_height="wrap_content"
           android:layout_margin="5dp" />
   </LinearLayout>
</android.support.v7.widget.CardView>

MainActivity.java

/**
 * Scrapes every paginated page of the blog at {@link #url} with Jsoup and shows each
 * post's title, author and upload date in a RecyclerView.
 *
 * <p>All network work runs in the {@link Description} AsyncTask; the three result lists
 * are filled in lock-step (one entry per post in each list) and handed to
 * {@code DataAdapter} once the task finishes.
 */
public class MainActivity extends AppCompatActivity {

   /** Tag used for every log line written by this activity. */
   private static final String LOG_TAG = "TAG1";

   private ProgressDialog mProgressDialog;
   private String url = "http://www.yudiz.com/blog/";
   private ArrayList<String> mAuthorNameList = new ArrayList<>();
   private ArrayList<String> mBlogUploadDateList = new ArrayList<>();
   private ArrayList<String> mPaginationList = new ArrayList<>();
   private ArrayList<String> mBlogTitleList = new ArrayList<>();

   @Override
   protected void onCreate(Bundle savedInstanceState) {
       super.onCreate(savedInstanceState);
       setContentView(R.layout.activity_main);

       new Description().execute();
   }

   @Override
   protected void onDestroy() {
       // Dismiss here as well: if the activity goes away while the task is still
       // running, a dialog left showing would leak its window.
       dismissProgressDialog();
       super.onDestroy();
   }

   /** Dismisses the loading dialog if it exists and is currently showing. */
   private void dismissProgressDialog() {
       if (mProgressDialog != null && mProgressDialog.isShowing()) {
           mProgressDialog.dismiss();
       }
   }

   /**
    * Background task that discovers the pagination links on the first blog page,
    * downloads each page and collects the post data into the activity's lists.
    */
   private class Description extends AsyncTask<Void, Void, Void> {
       @Override
       protected void onPreExecute() {
           super.onPreExecute();
           mProgressDialog = new ProgressDialog(MainActivity.this);
           mProgressDialog.setTitle("Android Basic JSoup Tutorial");
           mProgressDialog.setMessage("Loading...");
           mProgressDialog.setIndeterminate(false);
           mProgressDialog.show();
       }

       @Override
       protected Void doInBackground(Void... params) {
           try {
               // Connect to the web site
               Document mBlogDocument = Jsoup.connect(url).get();

               collectPageLinks(mBlogDocument);

               for (String mPageLink : mPaginationList) {
                   try {
                       // The first page is already downloaded; do not fetch it a second time.
                       Document mBlogPagination = url.equals(mPageLink)
                               ? mBlogDocument
                               : Jsoup.connect(mPageLink).get();
                       scrapePosts(mBlogPagination);
                   } catch (IOException e) {
                       // One unreachable page must not discard the remaining pages.
                       Log.e(LOG_TAG, "Failed to load page " + mPageLink, e);
                   }
               }
           } catch (IOException e) {
               Log.e(LOG_TAG, "Failed to load " + url, e);
           }
           return null;
       }

       /**
        * Fills {@code mPaginationList} with the href of every link inside the
        * pagination block of the given page.
        *
        * <p>Falls back to the start url when no pagination link is found, so a
        * blog that has only a single page is still scraped.
        */
       private void collectPageLinks(Document mBlogDocument) {
           // Select once and use the same selector for counting and indexing.
           Elements mPageLinkTags = mBlogDocument.select("div.pages a");
           int mPaginationSize = mPageLinkTags.size();

           for (int page = 0; page < mPaginationSize; page++) {
               String mPageLink = mPageLinkTags.eq(page).attr("href");
               if (mPageLink.isEmpty()) {
                   // Jsoup.connect("") would throw an unchecked exception.
                   continue;
               }
               mPaginationList.add(mPageLink);
               Log.i(LOG_TAG, mPageLink);
           }

           if (mPaginationList.isEmpty()) {
               mPaginationList.add(url);
           }
       }

       /**
        * Appends author, upload date and title of every post on the given page to
        * the activity's lists. Each post adds exactly one entry to each list; a
        * field missing at a given index is stored as an empty string.
        */
       private void scrapePosts(Document mBlogPagination) {
           // The number of author/date containers decides how many posts are read.
           int mElementSize = mBlogPagination.select("div[class=author-date]").size();

           // Run each selector once per page instead of once per post.
           Elements mAuthorLinks = mBlogPagination.select("span[class=vcard author post-author test]").select("a");
           Elements mUploadDates = mBlogPagination.select("span[class=post-date updated]");
           Elements mTitleLinks = mBlogPagination.select("h2[class=entry-title]").select("a");

           for (int i = 0; i < mElementSize; i++) {
               String mAuthorName = mAuthorLinks.eq(i).text().trim().replace("\n", "").replace("\t", "").replace("\r", "").replace("\b", "");
               String mBlogUploadDate = mUploadDates.eq(i).text();
               String mBlogTitle = mTitleLinks.eq(i).text();

               mAuthorNameList.add(mAuthorName);
               mBlogUploadDateList.add(mBlogUploadDate);
               mBlogTitleList.add(mBlogTitle);
           }
       }

       @Override
       protected void onPostExecute(Void result) {
           dismissProgressDialog();

           if (MainActivity.this.isFinishing()) {
               // The screen is closing; there is nothing left to populate.
               return;
           }

           // Show the scraped posts in the RecyclerView
           RecyclerView mRecyclerView = (RecyclerView) findViewById(R.id.act_recyclerview);

           DataAdapter mDataAdapter = new DataAdapter(MainActivity.this, mBlogTitleList, mAuthorNameList, mBlogUploadDateList);
           RecyclerView.LayoutManager mLayoutManager = new LinearLayoutManager(getApplicationContext());
           mRecyclerView.setLayoutManager(mLayoutManager);
           mRecyclerView.setAdapter(mDataAdapter);
       }
   }
}

DataAdapter.java

/**
 * RecyclerView adapter that renders one {@code row_data} card per blog post.
 *
 * <p>The three lists are parallel: index {@code i} of each list describes the same post.
 * The adapter keeps references to the lists it is given; it does not copy them.
 */
public class DataAdapter extends RecyclerView.Adapter<DataAdapter.MyViewHolder> {

   // Assigned once in the constructor; no throw-away lists are allocated up front.
   private final ArrayList<String> mBlogTitleList;
   private final ArrayList<String> mAuthorNameList;
   private final ArrayList<String> mBlogUploadDateList;
   private final Activity mActivity;

   /**
    * @param activity            the hosting activity
    * @param mBlogTitleList      post titles
    * @param mAuthorNameList     post author names, parallel to the titles
    * @param mBlogUploadDateList post upload dates, parallel to the titles
    */
   public DataAdapter(MainActivity activity, ArrayList<String> mBlogTitleList, ArrayList<String> mAuthorNameList, ArrayList<String> mBlogUploadDateList) {
       this.mActivity = activity;
       this.mBlogTitleList = mBlogTitleList;
       this.mAuthorNameList = mAuthorNameList;
       this.mBlogUploadDateList = mBlogUploadDateList;
   }

   /** Holds the three TextViews of a single {@code row_data} item. */
   public class MyViewHolder extends RecyclerView.ViewHolder {

       private TextView tv_blog_title, tv_blog_author, tv_blog_upload_date;

       public MyViewHolder(View view) {
           super(view);
           tv_blog_title = (TextView) view.findViewById(R.id.row_tv_blog_title);
           tv_blog_author = (TextView) view.findViewById(R.id.row_tv_blog_author);
           tv_blog_upload_date = (TextView) view.findViewById(R.id.row_tv_blog_upload_date);
       }
   }

   @Override
   public MyViewHolder onCreateViewHolder(ViewGroup parent, int viewType) {
       View itemView = LayoutInflater.from(parent.getContext())
               .inflate(R.layout.row_data, parent, false);

       return new MyViewHolder(itemView);
   }

   @Override
   public void onBindViewHolder(MyViewHolder holder, final int position) {
       holder.tv_blog_title.setText(mBlogTitleList.get(position));
       holder.tv_blog_author.setText(mAuthorNameList.get(position));
       holder.tv_blog_upload_date.setText(mBlogUploadDateList.get(position));
   }

   /**
    * Returns the number of rows that can be bound completely.
    *
    * <p>onBindViewHolder reads all three lists at the same position, so the count is
    * limited by the shortest list; this avoids an IndexOutOfBoundsException if the
    * lists ever differ in length.
    */
   @Override
   public int getItemCount() {
       return Math.min(mBlogTitleList.size(),
               Math.min(mAuthorNameList.size(), mBlogUploadDateList.size()));
   }
}

Step 3 : Test

pagination-image4


Viewing all articles
Browse latest Browse all 595

Trending Articles