第八节:Python实战:完善排名查询(网上seo培训)

建议参加者:
程序经验不丰富的培训参加者。
课程内容简介:
完善查排名的代码，并给它加些功能来监控自己和竞争对手的排名。
课后:
复习这些天的课程。代码不一定要完全自己写出来，可以复制一段段的代码拼出最终功能，但需要做到能够流畅地写出代码。
上半节
Windows 里 sys.argv[1] 的巧用——拖拽文件。(需要与 raw_input() 一起用)
代码1:
建议参加者:
程序经验不丰富的培训参加者。
课程内容简介:
完善查排名的代码，并给它加些功能来监控自己和竞争对手的排名。
课后:
复习这些天的课程。代码不一定要完全自己写出来，可以复制一段段的代码拼出最终功能，但需要做到能够流畅地写出代码。
上半节
Windows 里 sys.argv[1] 的巧用——拖拽文件。(需要与 raw_input() 一起用)
代码1:
01

import re, urllib2, urllib, sys

def get_urls(kw):
    """Return the result URLs Baidu shows for keyword *kw*, in page order.

    Fetches the result page for the URL-quoted keyword with ``rn=100`` and
    pulls each result's displayed URL out of the HTML with a regex.

    NOTE(review): the HTML tags inside the regex and inside the two
    ``replace()`` calls were stripped when this listing was published.
    They are reconstructed here as ``<span class="g">``, ``<b>`` and
    ``</b>`` -- confirm against the actual Baidu result markup.
    """
    html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)).read()
    # Capture everything from the opening tag up to the first space.
    urls = re.findall(r'<span class="g">\s*(.+?) ', html)
    # Presumably the bold tags are keyword highlighting; remove them so a
    # plain substring test against a domain name works, then add the scheme.
    urls = ['http://' + url.replace('<b>', '').replace('</b>', '') for url in urls]
    return urls

# Command line: <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals. For every keyword, the first result whose URL contains
# the domain adds (100 - position), so better positions add more.
self = 0
competitor = 0

# One keyword per line; the with-block closes the file when done.
with open(input_file) as keyword_file:
    for line in keyword_file:
        kw = line.rstrip()
        if not kw:
            # A blank line would otherwise trigger a query for an empty keyword.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                self += 100 - pos
                self_find = True
            if not competitor_find and competitor_domain in url:
                competitor += 100 - pos
                competitor_find = True

# NOTE(review): the published listing lost its indentation; the totals are
# assumed to be printed once, after all keywords are processed.
print('%s %s' % (self, competitor))

下半节

import re, urllib2, urllib, sys, time

# Today's date as YYYY-MM-DD (local time).
date = time.strftime('%Y-%m-%d')

# Score awarded for ranking at positions 1-10; index 0 is position 1.
# NOTE(review): the figures look like an estimated share of clicks per
# position, in percent -- the listing does not state their origin.
rank_score = [
    28.56,
    19.23,
    10.20,
    8.14,
    7.50,
    5.72,
    4.01,
    4.41,
    5.53,
    6.70,
]

def get_urls(kw):
    """Return the result URLs Baidu shows for keyword *kw*, in page order.

    Fetches the result page for the URL-quoted keyword with ``rn=100``.
    When the response contains a link to verify.baidu.com (the captcha
    page), prints ``captcha``, sleeps 600 seconds and tries again; this
    repeats until a page without that link is received.

    NOTE(review): the HTML tag inside the regex was stripped when this
    listing was published. It is reconstructed here as
    ``<span class="g">`` -- confirm against the actual Baidu markup.
    """
    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)).read()
        if '="http://verify.baidu.com/' not in html:
            break
        print('captcha')
        time.sleep(600)

    # Capture everything from the opening tag up to the first space.
    urls = re.findall(r'<span class="g">\s*(.+?) ', html)
    urls = ['http://' + url for url in urls]
    return urls

# Command line: <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals of rank_score weights for each domain.
self = 0
competitor = 0

# One keyword per line; the with-block closes the file when done.
with open(input_file) as keyword_file:
    for line in keyword_file:
        kw = line.rstrip()
        if not kw:
            # A blank line would otherwise trigger a query for an empty keyword.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain,
        # and it only scores when it is within the top 10 positions.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                if pos <= 10:
                    self += rank_score[pos-1]
                self_find = True
            if not competitor_find and competitor_domain in url:
                if pos <= 10:
                    competitor += rank_score[pos-1]
                competitor_find = True

# Append one tab-separated row for this run. %d writes only the integer
# part of each score. The with-block closes the report even on error.
with open('rank_report.csv', 'a') as f:
    f.write('%s\t%d\t%d\r\n' % (date, self, competitor))

上节课后习题参考答案:

import sys, urllib, urllib2, re

# Command line: <keyword> <host to look for>
kw = sys.argv[1]  # keyword
host = sys.argv[2]

# -1 means the host was not found on any of the pages checked.
rank = -1
for pn in range(1, 10+1):
    html = urllib2.urlopen('http://www.so.com/s?q=%s&pn=%d' % (urllib.quote_plus(kw), pn)).read()
    # NOTE(review): the published pattern is cut off after "[\s\S]*?" (the
    # HTML that followed was stripped, along with the closing quote). The
    # href capture below is a reconstruction -- confirm against the actual
    # so.com result markup.
    urls = re.findall(r'class="res-list">[\s\S]*?<a href="(.+?)"', html)
    for pos, url in enumerate(urls, 1):
        if host in url:
            # Position across pages, assuming 10 results per page.
            rank = pos + (pn-1)*10
            break
    if rank != -1:
        break

print(rank)

# coding:utf-8

import sys, urllib, urllib2

def is_indexed(url):
    """Return 1 if a Baidu search for *url* finds it, otherwise 0.

    Searches Baidu for the URL-quoted address. When the response contains
    a link to verify.baidu.com (the captcha page), prints ``captcha``,
    sleeps 600 seconds and tries again. A page containing either of the
    two "not found" markers yields 0; any other page yields 1.

    NOTE(review): the second marker was split across lines in the
    published listing, which suggests a stripped ``<p>`` tag; it is
    reconstructed as such here -- confirm against the actual Baidu markup.
    """
    # This listing's import line does not include time, so the original
    # bare sleep(600) raised NameError; import it locally.
    import time

    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s' % urllib.quote_plus(url)).read()
        if '="http://verify.baidu.com' in html:
            print('captcha')
            time.sleep(600)
            continue
        if 'class=f14>没有找到该URL。您可以直接访问' in html or 'class="nors"><p>抱歉，没有找到与' in html:
            return 0
        else:
            return 1

# total: URLs checked; count: URLs reported as indexed.
total = 0
count = 0

# One URL per line in the file named by the first argument.
with open(sys.argv[1]) as url_file:
    for line in url_file:
        url = line.rstrip()
        if not url:
            # A blank line would otherwise be searched as an empty URL.
            continue
        indexed = is_indexed(url)
        print('%s %s' % (url, indexed))

        if indexed == 1:
            count += 1
        total += 1

# An empty input file would otherwise raise ZeroDivisionError here.
rate = count * 100 / total if total else 0
print('收录率:%d%%' % rate)

print self, competitor

下半节

import re, urllib2, urllib, sys, time

# Today's date as YYYY-MM-DD (local time).
date = time.strftime('%Y-%m-%d')

# Score awarded for ranking at positions 1-10; index 0 is position 1.
# NOTE(review): the figures look like an estimated share of clicks per
# position, in percent -- the listing does not state their origin.
rank_score = [
    28.56,
    19.23,
    10.20,
    8.14,
    7.50,
    5.72,
    4.01,
    4.41,
    5.53,
    6.70,
]

def get_urls(kw):
    """Return the result URLs Baidu shows for keyword *kw*, in page order.

    Fetches the result page for the URL-quoted keyword with ``rn=100``.
    When the response contains a link to verify.baidu.com (the captcha
    page), prints ``captcha``, sleeps 600 seconds and tries again; this
    repeats until a page without that link is received.

    NOTE(review): the HTML tag inside the regex was stripped when this
    listing was published. It is reconstructed here as
    ``<span class="g">`` -- confirm against the actual Baidu markup.
    """
    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)).read()
        if '="http://verify.baidu.com/' not in html:
            break
        print('captcha')
        time.sleep(600)

    # Capture everything from the opening tag up to the first space.
    urls = re.findall(r'<span class="g">\s*(.+?) ', html)
    urls = ['http://' + url for url in urls]
    return urls

# Command line: <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals of rank_score weights for each domain.
self = 0
competitor = 0

# One keyword per line; the with-block closes the file when done.
with open(input_file) as keyword_file:
    for line in keyword_file:
        kw = line.rstrip()
        if not kw:
            # A blank line would otherwise trigger a query for an empty keyword.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain,
        # and it only scores when it is within the top 10 positions.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                if pos <= 10:
                    self += rank_score[pos-1]
                self_find = True
            if not competitor_find and competitor_domain in url:
                if pos <= 10:
                    competitor += rank_score[pos-1]
                competitor_find = True

# Append one tab-separated row for this run. %d writes only the integer
# part of each score. The with-block closes the report even on error.
with open('rank_report.csv', 'a') as f:
    f.write('%s\t%d\t%d\r\n' % (date, self, competitor))

上节课后习题参考答案:

import sys, urllib, urllib2, re

# Command line: <keyword> <host to look for>
kw = sys.argv[1]  # keyword
host = sys.argv[2]

# -1 means the host was not found on any of the pages checked.
rank = -1
for pn in range(1, 10+1):
    html = urllib2.urlopen('http://www.so.com/s?q=%s&pn=%d' % (urllib.quote_plus(kw), pn)).read()
    # NOTE(review): the published pattern is cut off after "[\s\S]*?" (the
    # HTML that followed was stripped, along with the closing quote). The
    # href capture below is a reconstruction -- confirm against the actual
    # so.com result markup.
    urls = re.findall(r'class="res-list">[\s\S]*?<a href="(.+?)"', html)
    for pos, url in enumerate(urls, 1):
        if host in url:
            # Position across pages, assuming 10 results per page.
            rank = pos + (pn-1)*10
            break
    if rank != -1:
        break

print(rank)

# coding:utf-8

import sys, urllib, urllib2

def is_indexed(url):
    """Return 1 if a Baidu search for *url* finds it, otherwise 0.

    Searches Baidu for the URL-quoted address. When the response contains
    a link to verify.baidu.com (the captcha page), prints ``captcha``,
    sleeps 600 seconds and tries again. A page containing either of the
    two "not found" markers yields 0; any other page yields 1.

    NOTE(review): the second marker was split across lines in the
    published listing, which suggests a stripped ``<p>`` tag; it is
    reconstructed as such here -- confirm against the actual Baidu markup.
    """
    # This listing's import line does not include time, so the original
    # bare sleep(600) raised NameError; import it locally.
    import time

    while 1:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s' % urllib.quote_plus(url)).read()
        if '="http://verify.baidu.com' in html:
            print('captcha')
            time.sleep(600)
            continue
        if 'class=f14>没有找到该URL。您可以直接访问' in html or 'class="nors"><p>抱歉，没有找到与' in html:
            return 0
        else:
            return 1

# total: URLs checked; count: URLs reported as indexed.
total = 0
count = 0

# One URL per line in the file named by the first argument.
with open(sys.argv[1]) as url_file:
    for line in url_file:
        url = line.rstrip()
        if not url:
            # A blank line would otherwise be searched as an empty URL.
            continue
        indexed = is_indexed(url)
        print('%s %s' % (url, indexed))

        if indexed == 1:
            count += 1
        total += 1

# An empty input file would otherwise raise ZeroDivisionError here.
rate = count * 100 / total if total else 0
print('收录率:%d%%' % rate)