For this project I scraped abjjad.com using the BeautifulSoup Python package.

from bs4 import BeautifulSoup
import requests
import csv
from itertools import zip_longest
new_book_name = []
new_author = []
new_cover_url = []
next_page_links = []
new_genres = []
new_descriptions = []
page_number = 0

First, we request each listing page with the requests package. From the listing page we collect the book name, the author, the cover URL, and the link to the book's detail page. Because of the site's structure (there is no single page that lists all the books), I found it easier to process every genre page separately, which meant changing the link several times by hand.
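As an aside, one way to avoid editing the link by hand would be to keep the genre listing URLs in a list and loop over them. This is only a sketch, not part of the script; the first URL comes from the code below, while the second entry is a placeholder for another genre page.

# Hypothetical alternative: iterate over several genre listing URLs.
genre_urls = [
    "https://www.abjjad.com/books/220759001/%D8%B1%D9%88%D8%A7%D9%8A%D8%A7%D8%AA-%D9%88%D9%82%D8%B5%D8%B5",
    # "https://www.abjjad.com/books/<genre-id>/<genre-name>",  # placeholder for another genre
]
for base_url in genre_urls:
    for page_number in range(20):
        result = requests.get(f"{base_url}/{page_number}/")
        goodsoup = BeautifulSoup(result.content, "lxml")
        # ...then the same extraction steps as in the loop below...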
while page_number < 20:
    result = requests.get(f'https://www.abjjad.com/books/220759001/%D8%B1%D9%88%D8%A7%D9%8A%D8%A7%D8%AA-%D9%88%D9%82%D8%B5%D8%B5/{page_number}/')
    src = result.content
    goodsoup = BeautifulSoup(src, "lxml")
    ## book title
    book_name = goodsoup.find_all("a", {"data-ga": "BookBadge_Title"})
    ## author name
    author = goodsoup.find_all('span', {"class": "author"})
    ## cover image
    cover_url = goodsoup.find_all("a", {"class": "img"})
    # saving the detail page link into an array to get access to more data
    for i in range(len(book_name)):
        next_page_links.append("https://www.abjjad.com" + str(book_name[i]).split('href="')[1].lstrip().split('">')[0])
    # clean the title and author strings and collect the cover image URL
    for i in range(len(book_name)):
        NremovedLinks = str(book_name[i]).split('>')[1].lstrip().split('<')[0]
        NremovedR = NremovedLinks.replace('\r', '')
        new_book_name.append(NremovedR.replace('\n', ''))
        AremovedLinks = str(author[i].text)
        AremovedR = AremovedLinks.replace('\r', '')
        new_author.append(AremovedR.replace('\n', ''))
        new_cover_url.append(str(cover_url[i]).split('src="')[1].lstrip().split('"/>')[0])
    # move on to the next listing page (without this the loop never ends)
    page_number += 1
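A side note on the string splitting above: BeautifulSoup tags also expose their attributes and text directly, so the same values could be pulled out without serialising each tag with str(). A rough, untested equivalent that reuses the variables from the loop, and assumes the cover image URL sits on an <img> nested inside the anchor:

# Hypothetical attribute-access version of the extraction above (sketch only).
for i in range(len(book_name)):
    next_page_links.append("https://www.abjjad.com" + book_name[i].get("href"))
    new_book_name.append(book_name[i].text.strip())
    new_author.append(author[i].text.strip())
    img = cover_url[i].find("img")   # assumed markup: <a class="img"><img src="..."></a>
    if img is not None:
        new_cover_url.append(img.get("src"))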
This loop visits each book's detail page and collects its genres and description.

for link in next_page_links:
    secondresult = requests.get(link)
    secondsrc = secondresult.content
    secondgoodsoup = BeautifulSoup(secondsrc, 'lxml')
    ## genres: join all the <li> entries into one pipe-separated string
    genres = secondgoodsoup.find("ul", {"itemprop": "genre"})
    output_text = ""
    for item in genres.find_all('li'):
        output_text += item.text + "|"
    new_genres.append(output_text)
    ## description
    desc = secondgoodsoup.find("span", {"itemprop": "description", "class": "content"})
    DremovedLinks = str(desc).split('>')[1].lstrip().split('<')[0]
    DremovedR = DremovedLinks.replace('\r', '')
    new_descriptions.append(DremovedR.replace('\n', ''))
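One caveat with the detail-page loop: find() returns None when an element is missing, so a detail page without a genre list (or without a description) would make the loop above raise an AttributeError. A defensive variant of the genre part, shown only as an illustration and not part of the original script, could look like this:

# Sketch: guard against a missing genre list on a detail page.
genres = secondgoodsoup.find("ul", {"itemprop": "genre"})
if genres is None:
    new_genres.append("")          # page has no genre list
else:
    output_text = ""
    for item in genres.find_all('li'):
        output_text += item.text + "|"
    new_genres.append(output_text)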
Finally, save the collected dataset to a CSV file. zip_longest pads out the columns if the lists end up with different lengths, and passing newline="" to open() keeps the csv module from writing blank rows on Windows.

file_list = [new_book_name, new_author, new_cover_url, new_genres, new_descriptions]
expo = zip_longest(*file_list)
with open("C:\\Users\\Lana\\Desktop\\Misk_DSI_capstone\\data\\abjjad.csv", "w", newline="", encoding="utf-8-sig") as myfile:
    writerobj = csv.writer(myfile)
    writerobj.writerow(["Book_title", "Author", "Cover_url", "genres", "descriptions"])
    writerobj.writerows(expo)
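To sanity-check the exported file, it can be read straight back, for example with pandas (my addition, assuming pandas is installed; the path is the same one used above):

import pandas as pd

df = pd.read_csv("C:\\Users\\Lana\\Desktop\\Misk_DSI_capstone\\data\\abjjad.csv", encoding="utf-8-sig")
print(df.shape)   # number of rows and columns actually written
print(df.head())  # first few books for a quick visual check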