import scrapy
import re
import urlparse
import socket
import datetime
import logging
import tldextract
from domain_spider.items import DomainSpiderItem
# NOTE(review): DOMAINS is not referenced anywhere in the visible code --
# confirm whether it is still needed.
DOMAINS= []
# TestSpider.parse tests every extracted link against these entries with a
# plain substring check (`x in url`) and stops checking on the first hit.
# Presumably a skip-list of large sites that should not be recorded or
# crawled -- verify the intent with the author.
DBD_DOMAINS = ['youtube.com', 'tumblr.com', 'facebook.com','google.com', 'twitter.com', 'yahoo.com','apple.com']
def get_domain(url):
    """Return the network-location (``netloc``) component of *url*.

    The value is taken verbatim from ``urlparse.urlparse``, so it still
    contains any port or ``user:password@`` prefix present in the URL, and
    it is an empty string when the URL has no ``//`` authority part.
    """
    parsed = urlparse.urlparse(url)
    return parsed.netloc
def items(item, domain, ip):
    """Fill *item* with the domain and the current timestamp, then return it.

    *item* is modified in place: ``item['domain']`` is set to *domain* and
    ``item['date']`` to the local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    NOTE(review): *ip* is accepted but never stored on the item -- confirm
    whether the item is supposed to carry an IP field.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    item['domain'] = domain
    item['date'] = timestamp
    return item
class TestSpider(scrapy.Spider):
    """Spider that records the domain of every outbound link it encounters.

    Starting from ``start_urls``, each response is scanned for ``<a href>``
    values; every accepted link produces one item and one follow-up request
    that is parsed by this same spider.
    """

    name = "Test"
    start_urls = [
        'https://xxx.com/',
    ]

    def parse(self, response):
        """Yield an item and a follow-up request for each accepted link.

        A link is accepted when its href starts with ``http`` and does not
        contain any entry of ``DBD_DOMAINS`` as a substring.

        Any exception raised while processing the response is logged as a
        warning and ends processing of that response; items and requests
        already yielded are unaffected.
        """
        try:
            for url in response.xpath('//a/@href').extract():
                if not url.startswith('http'):
                    continue
                if any(blocked in url for blocked in DBD_DOMAINS):
                    continue

                domain = get_domain(url)
                # NOTE(review): domain_get_ip is not defined in the visible
                # part of this file -- confirm it exists elsewhere.
                ip = domain_get_ip(domain)

                # Build a fresh item per link. Re-yielding one shared,
                # mutated instance would make every yielded item the same
                # object, so later links overwrite earlier ones.
                yield items(DomainSpiderItem(), domain, ip)

                # NOTE(review): dont_filter=True bypasses Scrapy's duplicate
                # filter, so already-visited URLs are requested again --
                # confirm that this is intended.
                yield scrapy.Request(url, callback=self.parse, dont_filter=True)
        except Exception as error:
            logging.warning(error)