-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathscraper.js
More file actions
46 lines (39 loc) · 1.18 KB
/
scraper.js
File metadata and controls
46 lines (39 loc) · 1.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
var request = require('request');
var cheerio = require('cheerio');
/* Scrapes the given URL for image links specified by the provided selector.
* Calls the callback with a list of URLs.
*
* Arguments:
* url -- the URL to scrape
* selector -- the CSS selector that identifies the image links
* callback -- the callback to call with the list of URLs
*/
exports.scrapeImageURLs = function(url, selector, callback) {
request.get(url, function(error, response, html) {
// exit on error or bad response code
if (error) {
throw error;
}
if (response.statusCode !== 200) {
throw new Exception("Didn't get a 200 status code.");
}
callback(findImageURLs(html, selector));
});
};
/* Collects the link target of every element matched by a selector.
 * Returns an array holding the href attribute of each match, in the order
 * the matches are visited.
 *
 * Arguments:
 * html -- the HTML markup to search
 * selector -- the CSS selector that picks out the image links
 */
function findImageURLs(html, selector) {
  // load the markup so it can be queried jQuery-style
  var $ = cheerio.load(html);
  var hrefs = [];
  // visit each matched element and record its href attribute
  $(selector).each(function(index, anchor) {
    hrefs.push($(anchor).attr('href'));
  });
  return hrefs;
}