Overview
This Jsoup example demonstrates how to select and iterate over all the elements of an HTML document using Jsoup.
Jsoup provides a select method that accepts CSS-style selectors for choosing elements.
Click here to get started with the basics of data scraping in Android using Jsoup.
Now we will scrape the data from every page of the paginated Yudiz blog and display it in a RecyclerView.
Steps :-
- First of all, we need to find the total number of pages available on the blog page.
- We store the URL of every page in an ArrayList.
- Then we connect to each URL in turn and extract all the data we need from it.
Step 1 : HTML Source Code
We will scrape data from the web page http://www.yudiz.com/blog/.
Total Number of pages HTML Code:-
<div class="pages"> <a href="http://www.yudiz.com/blog/" class="page active">1</a> <a href="http://www.yudiz.com/blog/page/2/" class="page">2</a> <a href="http://www.yudiz.com/blog/page/3/" class="page">3</a> <a href="http://www.yudiz.com/blog/page/4/" class="page">4</a> <a href="http://www.yudiz.com/blog/page/5/" class="page">5</a> <a href="http://www.yudiz.com/blog/page/6/" class="page">6</a> <a href="http://www.yudiz.com/blog/page/7/" class="page">7</a> <a href="http://www.yudiz.com/blog/page/8/" class="page">8</a> </div>
Author Name HTML Code:-
<span class="vcard author post-author test"> <a href="http://www.yudiz.com/author/sandeep-joshi/"> Sandeep Joshi </a> </span>
Blog Upload Date HTML Code:-
<span class="post-date updated">November 24, 2017</span>
Blog Title HTML Code:-
<div class="post-title"> <h2 class="entry-title" itemprop="headline"> <a href="http://www.yudiz.com/how-to-customize-your-app-icon/"> How to customize your app icon? </a> </h2> </div>
Note:- For scraping, you must find an HTML element (tag and class combination) that uniquely identifies each field you need. If the same HTML element is also used for other purposes on the page, you have to choose a different, more specific element instead.
Step 2 : Android Source Code
Permission needed in AndroidManifest.xml :-
<uses-permission android:name="android.permission.INTERNET" />
Gradle dependency to add :-
dependencies { implementation 'org.jsoup:jsoup:1.11.2' }
activity_main.xml
<?xml version="1.0" encoding="utf-8"?> <!-- Main screen layout: a ConstraintLayout whose only child is a match_parent RecyclerView (act_recyclerview) that MainActivity fills with the scraped blog posts. --> <android.support.constraint.ConstraintLayout xmlns:android="http://schemas.android.com/apk/res/android" xmlns:app="http://schemas.android.com/apk/res-auto" xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent" android:layout_height="match_parent" tools:context="com.jsoupdemo.MainActivity"> <android.support.v7.widget.RecyclerView android:id="@+id/act_recyclerview" android:layout_width="match_parent" android:layout_height="match_parent"> </android.support.v7.widget.RecyclerView> </android.support.constraint.ConstraintLayout>
row_data.xml
<?xml version="1.0" encoding="utf-8"?> <!-- Row layout for one list item: a CardView wrapping a vertical LinearLayout with three TextViews - blog title (bold), author and upload date - that DataAdapter looks up by id. --> <android.support.v7.widget.CardView xmlns:android="http://schemas.android.com/apk/res/android" android:layout_width="match_parent" android:layout_height="wrap_content" android:layout_margin="5dp"> <LinearLayout android:layout_width="match_parent" android:layout_height="wrap_content" android:orientation="vertical"> <TextView android:id="@+id/row_tv_blog_title" android:layout_width="match_parent" android:layout_height="wrap_content" android:layout_margin="5dp" android:textStyle="bold" /> <TextView android:id="@+id/row_tv_blog_author" android:layout_width="match_parent" android:layout_height="wrap_content" android:layout_margin="5dp" /> <TextView android:id="@+id/row_tv_blog_upload_date" android:layout_width="match_parent" android:layout_height="wrap_content" android:layout_margin="5dp" /> </LinearLayout> </android.support.v7.widget.CardView>
MainActivity.java
public class MainActivity extends AppCompatActivity { private ProgressDialog mProgressDialog; private String url = "http://www.yudiz.com/blog/"; private ArrayList<String> mAuthorNameList = new ArrayList<>(); private ArrayList<String> mBlogUploadDateList = new ArrayList<>(); private ArrayList<String> mPaginationList = new ArrayList<>(); private ArrayList<String> mBlogTitleList = new ArrayList<>(); @Override protected void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.activity_main); new Description().execute(); } private class Description extends AsyncTask<Void, Void, Void> { @Override protected void onPreExecute() { super.onPreExecute(); mProgressDialog = new ProgressDialog(MainActivity.this); mProgressDialog.setTitle("Android Basic JSoup Tutorial"); mProgressDialog.setMessage("Loading..."); mProgressDialog.setIndeterminate(false); mProgressDialog.show(); } @Override protected Void doInBackground(Void... params) { try { // Connect to the web site Document mBlogDocument = Jsoup.connect(url).get(); int mPaginationSize = mBlogDocument.select("div[class=pages]").select("a").size(); for (int page = 0; page < mPaginationSize; page++) { Elements mPageLinkTaga = mBlogDocument.select("div.pages a").eq(page); String mPageLink = mPageLinkTaga.attr("href"); mPaginationList.add(mPageLink); Log.i("TAG1", mPageLink); } for (int j = 0; j < mPaginationList.size(); j++) { Document mBlogPagination = Jsoup.connect(mPaginationList.get(j)).get(); // Using Elements to get the Meta data Elements mElementDataSize = mBlogPagination.select("div[class=author-date]"); // Locate the content attribute int mElementSize = mElementDataSize.size(); for (int i = 0; i < mElementSize; i++) { Elements mElementAuthorName = mBlogPagination.select("span[class=vcard author post-author test]").select("a").eq(i); String mAuthorName = mElementAuthorName.text().trim().replace("\n", "").replace("\t", "").replace("\r", "").replace("\b", ""); Elements 
mElementBlogUploadDate = mBlogPagination.select("span[class=post-date updated]").eq(i); String mBlogUploadDate = mElementBlogUploadDate.text(); Elements mElementBlogTitle = mBlogPagination.select("h2[class=entry-title]").select("a").eq(i); String mBlogTitle = mElementBlogTitle.text(); mAuthorNameList.add(mAuthorName); mBlogUploadDateList.add(mBlogUploadDate); mBlogTitleList.add(mBlogTitle); } } } catch (IOException e) { e.printStackTrace(); } return null; } @Override protected void onPostExecute(Void result) { // Set description into TextView RecyclerView mRecyclerView = (RecyclerView) findViewById(R.id.act_recyclerview); DataAdapter mDataAdapter = new DataAdapter(MainActivity.this, mBlogTitleList, mAuthorNameList, mBlogUploadDateList); RecyclerView.LayoutManager mLayoutManager = new LinearLayoutManager(getApplicationContext()); mRecyclerView.setLayoutManager(mLayoutManager); mRecyclerView.setAdapter(mDataAdapter); mProgressDialog.dismiss(); } } }
DataAdapter.java
public class DataAdapter extends RecyclerView.Adapter<DataAdapter.MyViewHolder> { private ArrayList<String> mBlogTitleList = new ArrayList<>(); private ArrayList<String> mAuthorNameList = new ArrayList<>(); private ArrayList<String> mBlogUploadDateList = new ArrayList<>(); private Activity mActivity; private int lastPosition = -1; public DataAdapter(MainActivity activity, ArrayList<String> mBlogTitleList, ArrayList<String> mAuthorNameList, ArrayList<String> mBlogUploadDateList) { this.mActivity = activity; this.mBlogTitleList = mBlogTitleList; this.mAuthorNameList = mAuthorNameList; this.mBlogUploadDateList = mBlogUploadDateList; } public class MyViewHolder extends RecyclerView.ViewHolder { private TextView tv_blog_title, tv_blog_author, tv_blog_upload_date; public MyViewHolder(View view) { super(view); tv_blog_title = (TextView) view.findViewById(R.id.row_tv_blog_title); tv_blog_author = (TextView) view.findViewById(R.id.row_tv_blog_author); tv_blog_upload_date = (TextView) view.findViewById(R.id.row_tv_blog_upload_date); } } @Override public MyViewHolder onCreateViewHolder(ViewGroup parent, int viewType) { View itemView = LayoutInflater.from(parent.getContext()) .inflate(R.layout.row_data, parent, false); return new MyViewHolder(itemView); } @Override public void onBindViewHolder(MyViewHolder holder, final int position) { holder.tv_blog_title.setText(mBlogTitleList.get(position)); holder.tv_blog_author.setText(mAuthorNameList.get(position)); holder.tv_blog_upload_date.setText(mBlogUploadDateList.get(position)); } @Override public int getItemCount() { return mBlogTitleList.size(); } }