第八节:Python实战:完善排名查询(网上seo培训)


建议参加者:
程序经验不丰富的培训参加者。
课程内容简介:
完善查排名的代码,并给它加些功能来监控自己和竞争对手的排名。
课后:
复习这些天的课程。不一定要完全自己从头写出来,可以复制一段段代码拼出最终功能,但需要做到能够流畅地写出代码。
上半节
Windows 里 sys.argv[1] 的巧用——拖拽文件。(需要与 raw_input() 一起用)
代码1:
建议参加者:
程序经验不丰富的培训参加者。
课程内容简介:
完善查排名的代码,并给它加些功能来监控自己和竞争对手的排名。
课后:
复习这些天的课程。不一定要完全自己从头写出来,可以复制一段段代码拼出最终功能,但需要做到能够流畅地写出代码。
上半节
Windows 里 sys.argv[1] 的巧用——拖拽文件。(需要与 raw_input() 一起用)
代码1:
01

import re, urllib2, urllib, sys

02
03

def get_urls(kw):
    """Fetch the Baidu results page for ``kw`` and return the matched URLs.

    The request asks for up to 100 results (``rn=100``). Returns a list of
    strings, each prefixed with ``http://``, in the order the pattern
    matched them in the page.
    """
    query = urllib.quote_plus(kw)
    html = urllib2.urlopen('http://www.baidu.com/s?wd=%s&rn=100' % query).read()
    # NOTE(review): this pattern looks truncated -- the HTML tag that anchored
    # it appears to have been stripped when the listing was published.
    # Restore it from the original lesson before relying on the results.
    urls = re.findall(r'\s*(.+?) ', html)
    # NOTE(review): both replace() calls have empty arguments and are no-ops
    # as published; presumably they removed highlight markup from the
    # displayed URL -- confirm against the original lesson.
    urls = ['http://' + url.replace('', '').replace('', '') for url in urls]
    return urls

08
09

# Usage: script.py <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals over all keywords: a first hit at position `pos` is worth
# `100 - pos` points, so higher rankings contribute more.
self_score = 0
competitor_score = 0

with open(input_file) as keywords:
    for line in keywords:
        kw = line.rstrip()
        if not kw:
            # Skip blank lines instead of sending an empty query.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                self_score += 100 - pos
                self_find = True
            if not competitor_find and competitor_domain in url:
                competitor_score += 100 - pos
                competitor_find = True

# NOTE(review): indentation was lost in the published listing; the totals
# are assumed to be printed once, after every keyword has been processed.
# A single pre-formatted string prints the same text as the Python 2
# statement `print self_score, competitor_score`.
print('%s %s' % (self_score, competitor_score))

下半节

01

import re, urllib2, urllib, sys, time

02
03

# Date stamp (YYYY-MM-DD) written in the first column of each report row.
date = time.strftime('%Y-%m-%d')

# Score credited for a first hit at positions 1..10 (index 0 is position 1).
# The ten values sum to 100; presumably they are estimated click shares per
# result position -- confirm against the lesson material.
rank_score = [
    28.56,
    19.23,
    10.20,
    8.14,
    7.50,
    5.72,
    4.01,
    4.41,
    5.53,
    6.70,
]

17
18

def get_urls(kw):
    """Fetch the Baidu results page for ``kw`` and return the matched URLs.

    The request asks for up to 100 results (``rn=100``). If the response
    contains the ``verify.baidu.com`` marker, the function prints
    ``captcha``, sleeps for 600 seconds and requests the page again, looping
    until a response without the marker arrives.

    Returns a list of strings, each prefixed with ``http://``.
    """
    # The request URL does not change between retries, so build it once.
    query_url = 'http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)
    while 1:
        html = urllib2.urlopen(query_url).read()
        if '="http://verify.baidu.com/' not in html:
            break
        print('captcha')
        time.sleep(600)
    # NOTE(review): this pattern looks truncated -- the HTML tag that anchored
    # it appears to have been stripped when the listing was published.
    # Restore it from the original lesson before relying on the results.
    urls = re.findall(r'\s*(.+?) ', html)
    urls = ['http://' + url for url in urls]
    return urls

30
31

# Usage: script.py <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals over all keywords for each domain.
self_score = 0
competitor_score = 0

# The report is opened (append mode) before crawling starts, as in the
# original listing; `with` guarantees both files are closed on any exit.
with open('rank_report.csv', 'a') as f, open(input_file) as keywords:
    for line in keywords:
        kw = line.rstrip()
        if not kw:
            # Skip blank lines instead of sending an empty query.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                # Positions beyond the top 10 score nothing but still mark
                # the domain as found for this keyword.
                if pos <= 10:
                    self_score += rank_score[pos - 1]
                self_find = True
            if not competitor_find and competitor_domain in url:
                if pos <= 10:
                    competitor_score += rank_score[pos - 1]
                competitor_find = True

    # NOTE(review): indentation was lost in the published listing; one report
    # row per run, written after all keywords, is assumed.
    # %d truncates the fractional part of the float totals.
    f.write('%s\t%d\t%d\r\n' % (date, self_score, competitor_score))

上节课后习题参考答案:

01

import sys, urllib, urllib2, re

02
03

# Usage: script.py <keyword> <host>
kw = sys.argv[1]  # keyword
host = sys.argv[2]

# Rank of the first result containing `host`; stays -1 if none is found
# within the first 10 result pages.
rank = -1
for pn in range(1, 10 + 1):
    html = urllib2.urlopen(
        'http://www.so.com/s?q=%s&pn=%d' % (urllib.quote_plus(kw), pn)
    ).read()
    # NOTE(review): the pattern was truncated (and left unterminated) in the
    # published listing; the remainder, including the capture group for the
    # result URL, must be restored from the original lesson.
    urls = re.findall(r'class="res-list">[\s\S]*?', html)
    for pos, url in enumerate(urls, 1):
        if host in url:
            # The rank arithmetic assumes 10 results per page.
            rank = pos + (pn - 1) * 10
            break
    if rank != -1:
        break

print(rank)

01

# coding:utf-8

02

import sys, urllib, urllib2

03

def is_indexed(url):
    """Query Baidu for ``url`` and report whether it appears to be indexed.

    Returns 0 when the results page contains one of the two "not found"
    markers checked below, and 1 otherwise. If the response contains the
    ``verify.baidu.com`` marker, prints ``captcha``, sleeps for 600 seconds
    and retries.
    """
    # The listing's import line only brings in sys, urllib and urllib2, so
    # the original bare `sleep(600)` call raised NameError.
    import time

    # The request URL does not change between retries, so build it once.
    query_url = 'http://www.baidu.com/s?wd=%s' % urllib.quote_plus(url)
    while 1:
        html = urllib2.urlopen(query_url).read()
        if '="http://verify.baidu.com' in html:
            print('captcha')
            time.sleep(600)
            continue
        # NOTE(review): in the published listing the second literal was
        # broken across lines right after `class="nors">`; markup between
        # the two halves may have been lost -- confirm against the lesson.
        if ('class=f14>没有找到该URL。您可以直接访问' in html
                or 'class="nors">抱歉,没有找到与' in html):
            return 0
        else:
            return 1

14
15

# Usage: script.py <file with one URL per line>
total = 0  # URLs checked
count = 0  # URLs reported as indexed

with open(sys.argv[1]) as url_file:
    for line in url_file:
        url = line.rstrip()
        if not url:
            # Blank lines are not URLs; do not query or count them.
            continue
        indexed = is_indexed(url)
        # Same text as the Python 2 statement `print url, indexed`.
        print('%s %s' % (url, indexed))

        if indexed == 1:
            count += 1
        total += 1

# Guard against an empty input file, which previously raised
# ZeroDivisionError; `//` keeps the integer division of the original.
rate = count * 100 // total if total else 0
print('收录率:%d%%' % rate)

print self, competitor

下半节

01

import re, urllib2, urllib, sys, time

02
03

# Date stamp (YYYY-MM-DD) written in the first column of each report row.
date = time.strftime('%Y-%m-%d')

# Score credited for a first hit at positions 1..10 (index 0 is position 1).
# The ten values sum to 100; presumably they are estimated click shares per
# result position -- confirm against the lesson material.
rank_score = [
    28.56,
    19.23,
    10.20,
    8.14,
    7.50,
    5.72,
    4.01,
    4.41,
    5.53,
    6.70,
]

17
18

def get_urls(kw):
    """Fetch the Baidu results page for ``kw`` and return the matched URLs.

    The request asks for up to 100 results (``rn=100``). If the response
    contains the ``verify.baidu.com`` marker, the function prints
    ``captcha``, sleeps for 600 seconds and requests the page again, looping
    until a response without the marker arrives.

    Returns a list of strings, each prefixed with ``http://``.
    """
    # The request URL does not change between retries, so build it once.
    query_url = 'http://www.baidu.com/s?wd=%s&rn=100' % urllib.quote_plus(kw)
    while 1:
        html = urllib2.urlopen(query_url).read()
        if '="http://verify.baidu.com/' not in html:
            break
        print('captcha')
        time.sleep(600)
    # NOTE(review): this pattern looks truncated -- the HTML tag that anchored
    # it appears to have been stripped when the listing was published.
    # Restore it from the original lesson before relying on the results.
    urls = re.findall(r'\s*(.+?) ', html)
    urls = ['http://' + url for url in urls]
    return urls

30
31

# Usage: script.py <keyword file> <own domain> <competitor domain>
input_file = sys.argv[1]
self_domain = sys.argv[2]
competitor_domain = sys.argv[3]

# Running totals over all keywords for each domain.
self_score = 0
competitor_score = 0

# The report is opened (append mode) before crawling starts, as in the
# original listing; `with` guarantees both files are closed on any exit.
with open('rank_report.csv', 'a') as f, open(input_file) as keywords:
    for line in keywords:
        kw = line.rstrip()
        if not kw:
            # Skip blank lines instead of sending an empty query.
            continue
        urls = get_urls(kw)

        # Only the first (best) hit per keyword counts for each domain.
        self_find = False
        competitor_find = False
        for pos, url in enumerate(urls, 1):
            if not self_find and self_domain in url:
                # Positions beyond the top 10 score nothing but still mark
                # the domain as found for this keyword.
                if pos <= 10:
                    self_score += rank_score[pos - 1]
                self_find = True
            if not competitor_find and competitor_domain in url:
                if pos <= 10:
                    competitor_score += rank_score[pos - 1]
                competitor_find = True

    # NOTE(review): indentation was lost in the published listing; one report
    # row per run, written after all keywords, is assumed.
    # %d truncates the fractional part of the float totals.
    f.write('%s\t%d\t%d\r\n' % (date, self_score, competitor_score))

上节课后习题参考答案:

01

import sys, urllib, urllib2, re

02
03

# Usage: script.py <keyword> <host>
kw = sys.argv[1]  # keyword
host = sys.argv[2]

# Rank of the first result containing `host`; stays -1 if none is found
# within the first 10 result pages.
rank = -1
for pn in range(1, 10 + 1):
    html = urllib2.urlopen(
        'http://www.so.com/s?q=%s&pn=%d' % (urllib.quote_plus(kw), pn)
    ).read()
    # NOTE(review): the pattern was truncated (and left unterminated) in the
    # published listing; the remainder, including the capture group for the
    # result URL, must be restored from the original lesson.
    urls = re.findall(r'class="res-list">[\s\S]*?', html)
    for pos, url in enumerate(urls, 1):
        if host in url:
            # The rank arithmetic assumes 10 results per page.
            rank = pos + (pn - 1) * 10
            break
    if rank != -1:
        break

print(rank)

01

# coding:utf-8

02

import sys, urllib, urllib2

03

def is_indexed(url):
    """Query Baidu for ``url`` and report whether it appears to be indexed.

    Returns 0 when the results page contains one of the two "not found"
    markers checked below, and 1 otherwise. If the response contains the
    ``verify.baidu.com`` marker, prints ``captcha``, sleeps for 600 seconds
    and retries.
    """
    # The listing's import line only brings in sys, urllib and urllib2, so
    # the original bare `sleep(600)` call raised NameError.
    import time

    # The request URL does not change between retries, so build it once.
    query_url = 'http://www.baidu.com/s?wd=%s' % urllib.quote_plus(url)
    while 1:
        html = urllib2.urlopen(query_url).read()
        if '="http://verify.baidu.com' in html:
            print('captcha')
            time.sleep(600)
            continue
        # NOTE(review): in the published listing the second literal was
        # broken across lines right after `class="nors">`; markup between
        # the two halves may have been lost -- confirm against the lesson.
        if ('class=f14>没有找到该URL。您可以直接访问' in html
                or 'class="nors">抱歉,没有找到与' in html):
            return 0
        else:
            return 1

14
15

# Usage: script.py <file with one URL per line>
total = 0  # URLs checked
count = 0  # URLs reported as indexed

with open(sys.argv[1]) as url_file:
    for line in url_file:
        url = line.rstrip()
        if not url:
            # Blank lines are not URLs; do not query or count them.
            continue
        indexed = is_indexed(url)
        # Same text as the Python 2 statement `print url, indexed`.
        print('%s %s' % (url, indexed))

        if indexed == 1:
            count += 1
        total += 1

# Guard against an empty input file, which previously raised
# ZeroDivisionError; `//` keeps the integer division of the original.
rate = count * 100 // total if total else 0
print('收录率:%d%%' % rate)