
Automatic Web Content Extraction and Word Cloud Visualization

I recently received a batch of data to analyze. The conventional approach would be to parse the pages with bs4 and pull out the content. But I had heard about automatic web content extraction a long time ago, so I decided to try it: CxExtractor, from cx-extractor-python.

The automatic web extraction approaches I currently know of are:

  • line-block distribution
  • text density

The method tried in this post is the line-block distribution one, CxExtractor.
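
To make the line-block idea concrete, here is a tiny illustrative sketch (my own, not the library's code): slide a window of a few consecutive lines over the stripped text and sum their character counts; the article body is wherever that sum stays above a threshold.

```python
# Illustrative sketch of the line-block density metric, not the actual CxExtractor code.
def block_density(lines, blocks_width=3):
    """For each line i, return the total character count of lines i..i+blocks_width-1."""
    stripped = [line.strip() for line in lines]
    return [
        sum(len(stripped[j]) for j in range(i, i + blocks_width))
        for i in range(len(stripped) - blocks_width)
    ]

sample = [
    "", "Home | About | Contact", "",
    "The first long sentence of the article body goes here.",
    "A second long sentence keeps the block density high.",
    "And a third one, so the dense region is easy to spot.",
    "", "Copyright footer", "",
]
print(block_density(sample))  # the density spikes over the three body lines
```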

The whole pipeline is simply read -> filter -> extract; the full CxExtractor implementation is listed below.

```python
import re
import chardet
import requests


class CxExtractor:
    """cx-extractor implemented in Python"""

    __text = []
    # __threshold = 186
    __indexDistribution = []
    # __blocksWidth = 3

    def __init__(self, threshold=86, blocksWidth=3):
        self.__blocksWidth = blocksWidth
        self.__threshold = threshold

    def getText(self, content):
        if self.__text:
            self.__text = []
        lines = content.split('\n')
        for i in range(len(lines)):
            # lines[i] = lines[i].replace("\\n", "")
            if lines[i] == ' ' or lines[i] == '\n':
                lines[i] = ''
        self.__indexDistribution.clear()
        for i in range(0, len(lines) - self.__blocksWidth):
            wordsNum = 0
            for j in range(i, i + self.__blocksWidth):
                lines[j] = lines[j].replace("\\s", "")
                wordsNum += len(lines[j])
            self.__indexDistribution.append(wordsNum)
        start = -1
        end = -1
        boolstart = False
        boolend = False
        for i in range(len(self.__indexDistribution) - 1):
            if(self.__indexDistribution[i] > self.__threshold and (not boolstart)):
                if (self.__indexDistribution[i + 1] != 0 or self.__indexDistribution[i + 2] != 0 or self.__indexDistribution[i + 3] != 0):
                    boolstart = True
                    start = i
                    continue
            if (boolstart):
                if (self.__indexDistribution[i] == 0 or self.__indexDistribution[i + 1] == 0):
                    end = i
                    boolend = True
            tmp = []
            if(boolend):
                for ii in range(start, end + 1):
                    if(len(lines[ii]) < 5):
                        continue
                    tmp.append(lines[ii] + "\n")
                str = "".join(list(tmp))
                if ("Copyright" in str or "版权所有" in str):
                    continue
                self.__text.append(str)
                boolstart = boolend = False
        result = "".join(list(self.__text))
        return result

    def replaceCharEntity(self, htmlstr):
        CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                         'lt': '<', '60': '<',
                         'gt': '>', '62': '>',
                         'amp': '&', '38': '&',
                         'quot': '"', '34': '"', }
        re_charEntity = re.compile(r'&#?(?P<name>\w+);')
        sz = re_charEntity.search(htmlstr)
        while sz:
            entity = sz.group()
            key = sz.group('name')
            try:
                htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
            except KeyError:
                # replace unknown entities with an empty string
                htmlstr = re_charEntity.sub('', htmlstr, 1)
                sz = re_charEntity.search(htmlstr)
        return htmlstr

    def getHtml(self, url):
        response = requests.get(url)
        encode_info = chardet.detect(response.content)
        response.encoding = encode_info['encoding']
        return response.text

    def readHtml(self, path, coding):
        page = open(path, encoding=coding)
        lines = page.readlines()
        s = ''
        for line in lines:
            s += line
        page.close()
        return s

    def filter_tags(self, htmlstr):
        re_nav = re.compile('<nav.+</nav>')
        re_cdata = re.compile('//<!\[CDATA\[.*//\]\]>', re.DOTALL)
        re_script = re.compile(
            '<\s*script[^>]*>.*?<\s*/\s*script\s*>', re.DOTALL | re.I)
        re_style = re.compile(
            '<\s*style[^>]*>.*?<\s*/\s*style\s*>', re.DOTALL | re.I)
        re_textarea = re.compile(
            '<\s*textarea[^>]*>.*?<\s*/\s*textarea\s*>', re.DOTALL | re.I)
        re_br = re.compile('<br\s*?/?>')
        re_h = re.compile('</?\w+.*?>', re.DOTALL)
        re_comment = re.compile('<!--.*?-->', re.DOTALL)
        re_space = re.compile(' +')
        s = re_cdata.sub('', htmlstr)
        s = re_nav.sub('', s)
        s = re_script.sub('', s)
        s = re_style.sub('', s)
        s = re_textarea.sub('', s)
        s = re_br.sub('', s)
        s = re_h.sub('', s)
        s = re_comment.sub('', s)
        s = re.sub('\\t', '', s)
        # s = re.sub(' ', '', s)
        s = re_space.sub(' ', s)
        s = self.replaceCharEntity(s)
        return s
```
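
A minimal usage sketch of the class above (the URL is just a placeholder, not one from the original data):

```python
# Assumes the CxExtractor class above is in scope (same file or imported).
cx = CxExtractor(threshold=186)  # the threshold is data-dependent; the post's own script uses 133
html = cx.getHtml("https://example.com/some-article.html")  # fetch page, detect encoding with chardet
clean = cx.filter_tags(html)   # strip scripts, styles, comments, tags and HTML entities
body = cx.getText(clean)       # keep only the line blocks whose density exceeds the threshold
print(body)
```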

You can click through to the repository and take a look; the code is not hard to follow, and in my scenario it was also very simple to apply. The results here were not perfect, though: my HTML files are report-style dumps of phone data that contain essentially everything, e.g. WeChat chat records and even deleted entries. Because the reports are very regular, there is no single large main-content block, and most of the data is laid out uniformly, so the accuracy is not 99%, but it is already quite decent. A little extra processing finishes the job, and it still saved me a lot of time.
[Screenshot of the extraction result, 2018-05-04]
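
The "little extra processing" mentioned above boils down to simple line-level cleanup; a hedged sketch of what that can look like (the function name and rules are my own, not the exact steps used here):

```python
# Hypothetical cleanup pass over the text returned by CxExtractor.getText().
def tidy_extracted_text(text, min_len=5):
    seen = set()
    kept = []
    for line in text.splitlines():
        line = line.strip()
        if len(line) < min_len or line in seen:  # drop very short or duplicated lines
            continue
        seen.add(line)
        kept.append(line)
    return "\n".join(kept)
```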

```python
import glob
import codecs
import queue
import threading
from CxExtractor import CxExtractor

cx = CxExtractor(threshold=133)

uqueue = queue.Queue()

from bs4 import BeautifulSoup
import lxml


def parserfile(f):
    # f = open(fpath).read()
    with open(f, encoding='utf-16le') as c:
        content = c.read()

    soup = BeautifulSoup(content, 'lxml')
    texts = []
    for item in soup.select('.selfTable'):
        try:
            text = item.find_all('a')[0].text
            texts.append(text)
        except Exception as e:
            print(e)

    for item in soup.select('.OuterTable'):
        if item:
            for sub in item.find_all('td'):
                texts.append(sub.text)
    return texts

# parserfile(html)

# with codecs.open(html, encoding="utf-16") as f:
#     parserfile(f.read())

# parserfile('../html/Contents0.html')
# get('.OuterTable')
htmls = glob.glob("../html/*.html")


def parserfile_auto(htmlpath):
    html = cx.readHtml(htmlpath, coding='utf-16le')
    content = cx.filter_tags(html)
    s = cx.getText(content)
    return s


import os
for html in htmls:
    text = parserfile_auto(html)
    # text = parserfile(html)
    with open('./phone-text-auto/' + os.path.basename(html) + '.txt', 'w', encoding='utf-8') as textfile:
        # t = "\n".join(text)
        textfile.write(text)
```
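
The script above imports queue and threading and creates uqueue, but then processes the files serially; if the intent was to parallelize, a sketch along these lines would do it (my assumption of the design, not code from the post):

```python
# Hypothetical worker-thread version built on the unused uqueue from the script above.
def worker():
    while True:
        path = uqueue.get()
        if path is None:              # sentinel: no more work for this worker
            uqueue.task_done()
            break
        text = parserfile_auto(path)
        out = './phone-text-auto/' + os.path.basename(path) + '.txt'
        with open(out, 'w', encoding='utf-8') as textfile:
            textfile.write(text)
        uqueue.task_done()

workers = [threading.Thread(target=worker) for _ in range(4)]
for w in workers:
    w.start()
for h in htmls:
    uqueue.put(h)
for _ in workers:
    uqueue.put(None)                  # one sentinel per worker thread
uqueue.join()
```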

The extracted text can then be segmented, run through keyword extraction, and rendered as a word cloud. One catch: wordcloud cannot render Chinese out of the box; you have to point it at a font that contains the glyphs via font_path.


```python
# coding:utf-8
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import re
import jieba.analyse
# jieba.load_userdict('./dict.ji')
jieba.enable_parallel(4)

with open('./zhongwen.txt', errors='ignore') as f:
    text_from_file_with_apath = f.read()


def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(
        filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


stopwords = stopwordslist('./dict.jieba')

for i in stopwords:
    # str.replace() returns a new string, so assign the result back
    text_from_file_with_apath = text_from_file_with_apath.replace(i, " ")
    jieba.add_word(i)

font_path = "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc"
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=False)
wl_space_split = " ".join(wordlist_after_jieba)

my_wordcloud = WordCloud(font_path=font_path).generate(wl_space_split)
tags = jieba.analyse.extract_tags(text_from_file_with_apath, topK=100)

print(",".join(tags))

plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
```

I won't include the word cloud image here. As for further relational analysis, possible directions include anomaly detection, transfer-record analysis, chat-log analysis, and topic modeling of the text. Analyzing this kind of data immediately reminds me of the Trump Twitter analysis in the neo4j sandbox, which has a lot in common with it.