For this project I scraped abjjad.com using the BeautifulSoup Python package.

from bs4 import BeautifulSoup
import requests
import csv
from itertools import zip_longest
new_book_name = []
new_author = []
new_cover_url = []
next_page_links = []
new_genres = []
new_descriptions = []
page_number = 0

First, we request each listing page with the requests package. From the listing page we collect the book name, the author, the cover URL, and the link to the book's detail page. Because of the site's structure (there is no single page that lists all the books), I found it easier to process every genre page separately, which meant changing the link several times by hand.
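As an aside, one way to avoid editing the link by hand would be to keep the genre listing URLs in a list and loop over them. This is only a sketch, not part of the script; the first URL comes from the code below, while the second entry is a placeholder for another genre page.

# Hypothetical alternative: iterate over several genre listing URLs.
genre_urls = [
    "https://www.abjjad.com/books/220759001/%D8%B1%D9%88%D8%A7%D9%8A%D8%A7%D8%AA-%D9%88%D9%82%D8%B5%D8%B5",
    # "https://www.abjjad.com/books/<genre-id>/<genre-name>",  # placeholder for another genre
]
for base_url in genre_urls:
    for page_number in range(20):
        result = requests.get(f"{base_url}/{page_number}/")
        goodsoup = BeautifulSoup(result.content, "lxml")
        # ...then the same extraction steps as in the loop below...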
while page_number < 20:
    result = requests.get(f'https://www.abjjad.com/books/220759001/%D8%B1%D9%88%D8%A7%D9%8A%D8%A7%D8%AA-%D9%88%D9%82%D8%B5%D8%B5/{page_number}/')
    src = result.content
    goodsoup = BeautifulSoup(src, "lxml")
    ## book title
    book_name = goodsoup.find_all("a", {"data-ga": "BookBadge_Title"})
    ## author name
    author = goodsoup.find_all('span', {"class": "author"})
    ## cover image
    cover_url = goodsoup.find_all("a", {"class": "img"})
    # saving the detail page link into an array to get access to more data
    for i in range(len(book_name)):
        next_page_links.append("https://www.abjjad.com" + str(book_name[i]).split('href="')[1].lstrip().split('">')[0])
    # clean the title and author strings and collect the cover image URL
    for i in range(len(book_name)):
        NremovedLinks = str(book_name[i]).split('>')[1].lstrip().split('<')[0]
        NremovedR = NremovedLinks.replace('\r', '')
        new_book_name.append(NremovedR.replace('\n', ''))
        AremovedLinks = str(author[i].text)
        AremovedR = AremovedLinks.replace('\r', '')
        new_author.append(AremovedR.replace('\n', ''))
        new_cover_url.append(str(cover_url[i]).split('src="')[1].lstrip().split('"/>')[0])
    # move on to the next listing page (without this the loop never ends)
    page_number += 1
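A side note on the string splitting above: BeautifulSoup tags also expose their attributes and text directly, so the same values could be pulled out without serialising each tag with str(). A rough, untested equivalent that reuses the variables from the loop, and assumes the cover image URL sits on an <img> nested inside the anchor:

# Hypothetical attribute-access version of the extraction above (sketch only).
for i in range(len(book_name)):
    next_page_links.append("https://www.abjjad.com" + book_name[i].get("href"))
    new_book_name.append(book_name[i].text.strip())
    new_author.append(author[i].text.strip())
    img = cover_url[i].find("img")   # assumed markup: <a class="img"><img src="..."></a>
    if img is not None:
        new_cover_url.append(img.get("src"))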
This loop visits each book's detail page and collects its genres and description.

for link in next_page_links:
    secondresult = requests.get(link)
    secondsrc = secondresult.content
    secondgoodsoup = BeautifulSoup(secondsrc, 'lxml')
    ## genres: join all the <li> entries into one pipe-separated string
    genres = secondgoodsoup.find("ul", {"itemprop": "genre"})
    output_text = ""
    for item in genres.find_all('li'):
        output_text += item.text + "|"
    new_genres.append(output_text)
    ## description
    desc = secondgoodsoup.find("span", {"itemprop": "description", "class": "content"})
    DremovedLinks = str(desc).split('>')[1].lstrip().split('<')[0]
    DremovedR = DremovedLinks.replace('\r', '')
    new_descriptions.append(DremovedR.replace('\n', ''))
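One caveat with the detail-page loop: find() returns None when an element is missing, so a detail page without a genre list (or without a description) would make the loop above raise an AttributeError. A defensive variant of the genre part, shown only as an illustration and not part of the original script, could look like this:

# Sketch: guard against a missing genre list on a detail page.
genres = secondgoodsoup.find("ul", {"itemprop": "genre"})
if genres is None:
    new_genres.append("")          # page has no genre list
else:
    output_text = ""
    for item in genres.find_all('li'):
        output_text += item.text + "|"
    new_genres.append(output_text)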
Finally, save the collected dataset to a CSV file. zip_longest pads out the columns if the lists end up with different lengths, and passing newline="" to open() keeps the csv module from writing blank rows on Windows.

file_list = [new_book_name, new_author, new_cover_url, new_genres, new_descriptions]
expo = zip_longest(*file_list)
with open("C:\\Users\\Lana\\Desktop\\Misk_DSI_capstone\\data\\abjjad.csv", "w", newline="", encoding="utf-8-sig") as myfile:
    writerobj = csv.writer(myfile)
    writerobj.writerow(["Book_title", "Author", "Cover_url", "genres", "descriptions"])
    writerobj.writerows(expo)
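To sanity-check the exported file, it can be read straight back, for example with pandas (my addition, assuming pandas is installed; the path is the same one used above):

import pandas as pd

df = pd.read_csv("C:\\Users\\Lana\\Desktop\\Misk_DSI_capstone\\data\\abjjad.csv", encoding="utf-8-sig")
print(df.shape)   # number of rows and columns actually written
print(df.head())  # first few books for a quick visual check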