October 28, 2019

Sentiment Analysis of Maimai Gossip (职言) Posts

The plan is simple: first scrape last week's posts from Maimai's Gossip (职言) feed; then classify them with the Senta-BiLSTM model; finally, pick any plotting tool and turn the results into a report. Let's get to it.

  • Scrape the data. Two steps: a screenshot and some code (disclaimer: to stay out of trouble, the screenshot is borrowed and the code is adapted...).
  • The session credentials expire quickly; if requests start failing, refresh the fields marked with the comment "# replace with your own value" from your own browser session.

(Screenshot of the Maimai Gossip feed)

# coding=utf-8

import requests

def geturl(page):
        url = 'https://maimai.cn/sdk/web/gossip_list?'
        params = {
                'u':'999999', # replace with your own value
                'channel':'www',
                'version':'4.0.0',
                '_csrf':'lalalalalala', # replace with your own value
                'access_token':'1.ooooooooooaaaaaaaaaaaaaa', # replace with your own value
                'page':page,
                'jsononly':'1'
        }
        for item in params:
                url = url + item + '='+ params[item] + "&"
        url = url[:-1]
        return url

def getGossipList():
        headers={
                'Accept':'text/html,application/xhtml+xml,application/xml',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'zh-CN,zh;q=0.9',
                'Connection':"keep-alive",
                'Host':'maimai.cn',
                'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
                'cookie':'******',
                'referer':'https://maimai.cn/gossip_list'
        }
        i = 0
        content = []
        while i < 10: # 10 posts per page; grab as many pages as you want, but too many requests will get you banned
                url = geturl(str(i))
                r = requests.get(url, timeout=10, headers=headers)
                if r.status_code != 200:
                        break # stop paging on errors, keep what we already have
                data = r.json()['data']
                for item in data:
                        text = item.get('text', '').strip()
                        if len(text) > 0:
                                print(text)
                                content.append(text)
                i += 1
        
        with open(r'./data.txt', 'w', encoding='utf-8') as data_file:
                data_file.write('\n'.join(content))


if __name__ == "__main__":
    getGossipList()
  • With paddlepaddle and paddlehub installed and configured, use senta_bilstm as the pretrained model and fine-tune it on the ChnSentiCorp dataset. Code below; once it finishes, the current directory will contain an mmf_nlp_senta folder holding the fine-tuned model parameters, which we'll need in the next step.
# coding:utf-8

import paddlehub as hub
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler 


# Load the pretrained model through PaddleHub
module = hub.Module(name="senta_bilstm")
# PaddleHub also ships the ChnSentiCorp dataset...
dataset = hub.dataset.ChnSentiCorp()
print(dataset.num_labels)


# Build a reader that loads the data and segments it into words along the way
reader = hub.reader.LACClassifyReader(
    dataset=dataset, vocab_path=module.get_vocab_path())

# Adam optimizer with weight decay
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=5e-5,
    lr_scheduler="linear_decay",
    optimizer_name="adam")


config = hub.RunConfig(
    use_cuda=True, # use the GPU if you have one, otherwise set this to False
    num_epoch=1,
    checkpoint_dir="mmf_nlp_senta",
    batch_size=32,
    log_interval=10,
    eval_interval=50,
    strategy=strategy)


# Create the classification task: a binary fully connected layer on top of sent_feature
inputs, outputs, program = module.context(trainable=True)

# Compile the program with in-place execution and memory optimization enabled
build_strategy = compiler.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = True

compiled_prog = compiler.CompiledProgram(program).with_data_parallel(build_strategy=build_strategy)
program = compiled_prog


sent_feature = outputs["sentence_feature"]

feed_list = [inputs["words"].name]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=sent_feature,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)

# Start fine-tuning
cls_task.finetune_and_eval()

  • Run the fine-tuned model over each scraped post and export its sentiment prediction
#coding:utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import os

import paddle
import paddle.fluid as fluid
import paddlehub as hub

# loading Paddlehub senta pretrained model
module = hub.Module(name="senta_bilstm")
inputs, outputs, program = module.context(trainable=True)

# Sentence classification  dataset reader
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.LACClassifyReader(
    dataset=dataset, vocab_path=module.get_vocab_path())

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=5e-5,
    lr_scheduler="linear_decay",
    optimizer_name="adam")

config = hub.RunConfig(
    use_data_parallel=False,
    use_pyreader=False,
    use_cuda=True,
    batch_size=1,
    enable_memory_optim=False,
    checkpoint_dir="mmf_nlp_senta",
    strategy=strategy)

sent_feature = outputs["sentence_feature"]

feed_list = [inputs["words"].name]

cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=sent_feature,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)

data = []

with open(r'./data.txt', encoding='utf-8') as data_file:
    data = data_file.readlines()

run_states = cls_task.predict(data=data)
results = [run_state.run_results for run_state in run_states]
index = 0
for batch_result in results:
    batch_result = np.argmax(batch_result, axis=2)[0]
    for result in batch_result:
        print("%s\tpredict=%s" % (data[index].strip(), result))
        index += 1
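
The loop above only prints each post with its predicted label. To feed the predictions into the plotting step, one option is to also write them to disk; a minimal sketch, using a hypothetical results.txt with one predicted label per line (0/1, following ChnSentiCorp's negative/positive convention):

# Hypothetical addition: collect the predicted labels and save them next to data.txt
# so the plotting step below can read them back.
labels = []
for batch_result in results:
    for result in np.argmax(batch_result, axis=2)[0]:
        labels.append(int(result))

with open(r'./results.txt', 'w', encoding='utf-8') as result_file:
    result_file.write('\n'.join(str(label) for label in labels))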

  • Finally, chart the results; if you're interested, you can refine this further in a notebook:
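
As a rough sketch of that reporting step, here is a minimal matplotlib example. It assumes the hypothetical results.txt written above (one 0/1 label per line, 1 treated as positive); the filename and chart styling are illustrative, not part of the original post.

# coding:utf-8
# Minimal sketch: bar chart of positive vs. negative gossip posts,
# assuming the hypothetical results.txt produced in the previous step.
import matplotlib.pyplot as plt

with open(r'./results.txt', encoding='utf-8') as result_file:
    labels = [int(line.strip()) for line in result_file if line.strip()]

positive = sum(labels)
negative = len(labels) - positive

plt.bar(['negative', 'positive'], [negative, positive])
plt.title('Maimai Gossip sentiment, last week')
plt.ylabel('number of posts')
plt.savefig('report.png')  # or plt.show() when working in a notebook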