class Abcdistill::Scraper
Public Instance Methods
addgoodreadsdotcom(linktail)
# File lib/abcdistill/scraper.rb, line 35
def addgoodreadsdotcom(linktail)
  "https://www.goodreads.com#{linktail}"
end
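A minimal usage sketch (the link tail below is the example path noted in the books_in_genre listing; a no-argument Scraper constructor is assumed):

  scraper = Abcdistill::Scraper.new
  scraper.addgoodreadsdotcom("/book/show/53991683-the-woman-in-the-moonlight")
  #=> "https://www.goodreads.com/book/show/53991683-the-woman-in-the-moonlight"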
book_detail(bookinstance)
The scraper is a tool rather than a record-keeping object, so it does not store anything in a class-level @@all array. Its methods are instance methods because each call is fed a different input (a book instance, a genre, a link tail) rather than operating on shared class state.
# File lib/abcdistill/scraper.rb, line 5
def book_detail(bookinstance)
  # eventually this will take a premade book instance (built with title and link)
  # and use that link to scrape the remaining details and fill them in.
  # relies on open-uri and nokogiri being required elsewhere in the gem.
  html = open('https://www.goodreads.com/book/show/45046808-big-lies-in-a-small-town')
  doc = Nokogiri::HTML(html) # returns a Nokogiri node set

  authorname = doc.css("a.authorName").text
  pages = doc.css("div#details span[itemprop=\"numberOfPages\"]").text # css can match by attribute
  publishdate = (doc.css("div#details .row")[1].text).split("\n")[2].strip
  # the publish-date row has an odd structure: whitespace shows up as its own item,
  # so the value has to be stripped. hopefully this layout is stable across pages.
  description = doc.css("div#description span")[1].text
  amazonlinktail = doc.css("ul.buyButtonBar a#buyButton").attribute("href").value
  amazonlink = addgoodreadsdotcom(amazonlinktail)
  # yes, the Amazon link on Goodreads is a redirect link hosted on goodreads.com.
  # the css lookups are kept separate so any selector can be adjusted and re-tested on its own.

  bookinstance.authorname  = authorname
  bookinstance.pages       = pages
  bookinstance.publishdate = publishdate
  bookinstance.description = description
  bookinstance.amazonlink  = amazonlink
end
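A hedged usage sketch. The Book.new(title, booklink, genre) signature and the Genre lookup are taken from the books_in_genre listing below; the reader methods on the book are assumed to mirror the writers used above:

  genre = Abcdistill::Genre.find_genre_by_name("Art")
  book  = Abcdistill::Book.new("Big Lies in a Small Town",
                               "https://www.goodreads.com/book/show/45046808-big-lies-in-a-small-town",
                               genre)
  Abcdistill::Scraper.new.book_detail(book)
  book.authorname  # author name scraped from the detail page
  book.pages       # page count text from the numberOfPages span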
books_in_genre(genre)
def testing(genrelink)
  # given a genre page such as https://www.goodreads.com/genres/art,
  # scrape all 15 book titles and their links, create 15 Book instances,
  # and make sure they are SAVED.
  html = open(genrelink)
  doc = Nokogiri::HTML(html) # returns a Nokogiri node set
  allbigboxes = doc.css(".coverBigBox")
  puts allbigboxes.size
  allbigboxes.each do |box|
    puts box.css(".h2Container h2 a").text
  end
end
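A small sketch of how this debugging helper might be invoked (the genre URL is the example from the comment above):

  Abcdistill::Scraper.new.testing("https://www.goodreads.com/genres/art")
  # prints the number of .coverBigBox sections on the page, then the heading
  # text of each one (one of which should be "Most Read This Week")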
# File lib/abcdistill/scraper.rb, line 53
def books_in_genre(genre)
  # given a genre page such as https://www.goodreads.com/genres/art,
  # scrape the 15 featured book titles and their links, create 15 Book instances,
  # and make sure they are SAVED.

  # gatekeeper: to prevent duplicates, bail out if the books in this genre were already fetched.
  # (a plain truthiness check won't work here, because an empty array is truthy in Ruby.)
  if Abcdistill::Genre.books_of_genre(genre) != []
    return
  end

  html = open(genre.genrelink)
  doc = Nokogiri::HTML(html) # returns a Nokogiri node set

  genrename = doc.css(".genreHeader h1").text.strip
  puts "the genrename that was scraped is -#{genrename}-"
  if genrename == "Childrens"
    genrename = "Children's"
  end
  puts "new genrename is #{genrename}"
  # the name is fixed manually because the url itself can be unreliable
  # (e.g. ...biography vs ...biography/), and splitting the url would mean
  # handling edge cases like that.

  genre = Abcdistill::Genre.find_genre_by_name(genrename)
  puts "did they find the genre in the list? the name found is #{genre.name}"
  # the genres were already added when the CLI class's list_options was called.

  # an earlier approach, mostread = doc.css(".bigBoxBody")[1], relied on the section
  # sitting at index 1, which is fragile. instead, find the box by its heading text.
  allbigboxes = doc.css(".coverBigBox")
  mostread = allbigboxes.find do |box|
    box.css(".h2Container h2 a").text.include?("Most Read This Week")
  end
  # include? is used instead of == because the heading text is not uniform:
  # sometimes it reads "Most Read This Week Tagged Christian", for example.
  # mostread holds the single big-box section we want.

  books = mostread.css(".bookBox a")
  puts "how many books found in books = mostread.css.... #{books.size}"
  # this puts is really helpful for debugging. don't remove it.

  books.each do |book|
    title = book.css("img").attribute("alt").value
    booklink = addgoodreadsdotcom(book.attribute("href").value)
    # create a new book instance:
    Abcdistill::Book.new(title, booklink, genre)
  end
  puts "how many books of this genre were added and recognized: #{Abcdistill::Genre.books_of_genre(genre).size}"
  # Abcdistill::Genre.books_of_genre(genre).each do |book|
  #   puts book
  #   puts book.title
  # end

  # separation of tasks: this method only fetches the books in the genre; it does not
  # display them. display should be a method on Genre or Book.

  # note on the loop structure: each .bookBox a element acts as the "root" of one iteration.
  # the title comes from the alt attribute of its img child and the link from its own href,
  # so both pieces hang off that one root. scraping elements one by one doesn't force that
  # insight, but turning it into a loop does: find what the pieces have in common and
  # refactor around it.

  # earlier single-element experiments, kept for reference:
  # title    = mostread.css(".bookBox a img")[0].attribute("alt").value
  # linktail = mostread.css(".bookBox a")[0].attribute("href").value  # e.g. /book/show/53991683-the-woman-in-the-moonlight
  # booklink = addgoodreadsdotcom(linktail)

  # how should all of this be organized? we never want to scrape anything twice, so
  # everything is saved: some collection holds the different genres, each genre holds its
  # 15 books (title and link), and book_detail later fills in the other properties
  # (author, pages, etc.). modelling that as Genre and Book objects keeps it life-like.
end
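A hedged end-to-end sketch under the assumptions already visible in this file (Genre.find_genre_by_name and Genre.books_of_genre exist as used above, and Genre instances respond to genrelink). Note that book_detail in its current form still opens a hard-coded URL, so the per-book call below shows the intended flow once that TODO is done:

  scraper = Abcdistill::Scraper.new
  genre   = Abcdistill::Genre.find_genre_by_name("Art")
  scraper.books_in_genre(genre)   # scrapes and saves up to 15 Book instances
  Abcdistill::Genre.books_of_genre(genre).each do |book|
    scraper.book_detail(book)     # fills in author, pages, publish date, etc.
  end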