博客
关于我
强烈建议你试试无所不能的ChatGPT,快点击我
Python登录人人网并抓取新鲜事
阅读量:4611 次
发布时间:2019-06-09

本文共 2166 字,大约阅读时间需要 7 分钟。

import contextlib
import cookielib
import sys
import urllib
import urllib2
from sgmllib import SGMLParser
class spider(SGMLParser):
    """Log in to renren.com and scrape feed entries out of the returned HTML.

    The parser looks for an ``<h3>`` element (whose ``<a>`` text is taken as
    the poster's name) followed by a ``<div class="content">`` element (whose
    text is taken as what was said).  Results accumulate in ``self.dic`` as
    ``{name: [text fragment, ...]}``.

    Typical use: construct, call ``login()``, pass ``self.file`` to
    ``feed()``, then call ``show()``.

    NOTE: constructing an instance installs a process-wide ``urllib2`` opener
    with cookie handling, so every later ``urllib2.urlopen`` call in the
    process shares the same cookie jar.
    """

    def __init__(self, email, password):
        """Initialise parser state and install a cookie-aware URL opener.

        Args:
            email: account e-mail sent as the ``email`` form field.
            password: account password sent as the ``password`` form field.
        """
        SGMLParser.__init__(self)

        # --- parser state -------------------------------------------------
        self.h3 = False           # currently between <h3> and </h3>
        self.h3_is_ready = False  # an </h3> was seen; a content <div> may follow
        self.div = False          # currently inside a <div class="content">
        self.h3_and_div = False   # the content <div> belonging to the last <h3> is open
        self.a = False            # inside an <a> that is within an <h3> or content <div>
        self.depth = 0            # nesting level of <div>s inside the content <div>
        self.names = ""           # name text collected for the current entry
        self.dic = {}             # name -> list of collected text fragments

        # --- login form fields --------------------------------------------
        self.email = email
        self.password = password
        self.domain = 'renren.com'

        # Keep the session cookie between requests.  Any exception raised
        # here simply propagates to the caller.
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

    def login(self):
        """POST the credentials and store the response body in ``self.file``.

        Raises:
            urllib2.URLError: if the request fails.
        """
        # NOTE(review): credentials are sent over plain HTTP; switch to an
        # HTTPS endpoint if the site offers one.
        url = 'http://www.renren.com/PLogin.do'
        postdata = {
            'email': self.email,
            'password': self.password,
            'domain': self.domain,
        }
        req = urllib2.Request(url, urllib.urlencode(postdata))

        # urllib2 responses are not context managers on Python 2; closing()
        # guarantees the connection is released even if read() fails.
        with contextlib.closing(urllib2.urlopen(req)) as response:
            self.file = response.read()

    def start_h3(self, attrs):
        """Mark that an <h3> element has been opened."""
        self.h3 = True

    def end_h3(self):
        """Leave the <h3> and allow the following content <div> to be read."""
        self.h3 = False
        self.h3_is_ready = True

    def start_a(self, attrs):
        """Track <a> elements, but only inside an <h3> or a content <div>."""
        if self.h3 or self.div:
            self.a = True

    def end_a(self):
        """Mark that the current <a> element has been closed."""
        self.a = False

    def start_div(self, attrs):
        """Enter a content <div>, or count a <div> nested inside one.

        Divs are ignored entirely until an </h3> has been seen.
        """
        if not self.h3_is_ready:
            return
        if self.div:
            # Already inside the content div: remember the extra nesting so
            # that the matching </div> does not end the entry early.
            self.depth += 1

        for k, v in attrs:
            if k == 'class' and v == 'content':
                self.div = True
                self.h3_and_div = True  # the h3 and this div belong together

    def end_div(self):
        """Close a nested <div>, or finish the entry at the outermost level."""
        if self.depth == 0:
            # Outermost level: the current entry is finished, reset state.
            self.div = False
            self.h3_and_div = False
            self.h3_is_ready = False
            self.names = ""
        if self.div:
            # Still inside the content div, so this closed a nested div.
            self.depth -= 1

    def handle_data(self, text):
        """Route a chunk of text to the name or to the entry body."""
        # Text inside <h3><a>...</a></h3> is the poster's name.
        if self.h3 and self.a:
            self.names += text
        # Text inside <h3> but outside <a> is recorded under that name.
        if self.h3 and not self.a:
            if text:
                self.dic.setdefault(self.names, []).append(text)
            return
        # Text inside the content <div> that follows the <h3>.
        if self.h3_and_div:
            self.dic.setdefault(self.names, []).append(text)

    def show(self):
        """Write every collected entry to stdout as ``<name> <text>``.

        Text is decoded as UTF-8 and re-encoded to the file-system encoding.
        Characters that cannot be converted are replaced instead of aborting
        the whole listing.
        """
        # getfilesystemencoding() may return None on Python 2; fall back to
        # UTF-8 rather than passing None to encode().
        encoding = sys.getfilesystemencoding() or 'utf-8'

        def to_console(raw):
            # NOTE(review): the string literals on this line were lost when
            # the source was extracted; stripping spaces is the reconstructed
            # intent -- confirm against the original article.
            cleaned = raw.replace(' ', '')
            return cleaned.decode('utf-8', 'replace').encode(encoding, 'replace')

        for key in self.dic:
            name = to_console(key)
            says = to_console(''.join(self.dic[key]))
            # Same output as the Python 2 statement ``print name, says``.
            sys.stdout.write('%s %s\n' % (name, says))
 
 
 
 
# Demo run: replace the two placeholder strings with real credentials.
renrenspider = spider('your email', 'your password')
# Fetch the page returned by the login request; the HTML is stored in .file.
renrenspider.login()
# Parse the fetched HTML, filling renrenspider.dic with the scraped entries.
renrenspider.feed(renrenspider.file)
# Print the collected entries to stdout.
renrenspider.show()

转载于:https://www.cnblogs.com/hd-zg/p/4932844.html

你可能感兴趣的文章
git warning: LF will be replaced by CRLF in 解决办法
查看>>
浅谈MVP设计模式
查看>>
深入理解PHP中的引用和赋值
查看>>
红黑树
查看>>
(转载)maven profile多环境自动切换配置
查看>>
py三个面试小问题
查看>>
图像类推效果图
查看>>
php pdo_mysql使用方法
查看>>
Android驱动开发第二章随想
查看>>
String API
查看>>
O(1)纬度减少循环次数
查看>>
绑定域名到 GitHub Pages
查看>>
javaweb-简单的验证码和算术验证码
查看>>
深入理解Javascript系列之类型
查看>>
DateTime数据类型保存问题(DateTime2)
查看>>
【算法学习】【洛谷】cdq分治 & P3810 三维偏序
查看>>
1025 反转链表 (25 分)
查看>>
基于Pojo的开发模式(day03)
查看>>
jQuery input -> file change事件bug
查看>>
前端开发 - CSS - 上
查看>>