吕雄
Ink stains half the page; a quiet heart simmers words.
Take the Nanchang University news site as an example. Pulling up the 【南大要闻】 (NCU Headlines) column, e.g. http://news.ncu.edu.cn/html/2018/1-28/n4275903.html, and analyzing the link structure yields the regular expression:
http://news\.ncu\.edu\.cn\/html\/2018\/[0-9]\-[0-9]{2}\/[a-z0-9]{8}\.html$
It is easy to see that article links in the 【南大要闻】, 【媒体南大】 (NCU in the Media), and 【校园传真】 (Campus News) columns all use an "n plus 7 digits" file name, generalized here to [a-z0-9]{8}; only the titles and body text of 2018 articles are extracted. Inspecting the page elements shows that each article sits inside <div id='zoom'>, with the title in <font id='zoom_topic'> and the body in <div id='zoom_content'>.
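As a quick sanity check, a minimal snippet like the one below (not part of the original crawler) confirms that the pattern accepts the sample link; note that the single [0-9] for the month only covers one-digit months, which is fine for articles from early 2018.

import re
# Article-URL pattern from above: one-digit month, two-digit day, 8-character page name
pattern = r'http://news\.ncu\.edu\.cn/html/2018/[0-9]-[0-9]{2}/[a-z0-9]{8}\.html$'
url = 'http://news.ncu.edu.cn/html/2018/1-28/n4275903.html'
print(bool(re.match(pattern, url)))  # True: 'n4275903' matches [a-z0-9]{8}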
The code is as follows:
# coding: utf-8
import codecs
from urllib import request, parse
from bs4 import BeautifulSoup
import re
import time
from urllib.error import HTTPError, URLError
import sys

### News class definition
class News(object):
    def __init__(self):
        self.url = None      # URL of this article
        self.topic = None    # title
        self.date = None     # publication date
        self.content = None  # body text
        self.author = None   # author

### If the url matches the article pattern, extract its information
def getNews(url):
    # Fetch the whole page
    html = request.urlopen(url).read().decode('utf-8', 'ignore')
    # Parse it
    soup = BeautifulSoup(html, 'html.parser')
    # Extract the information
    if not soup.find('div', {'id': 'zoom'}): return
    news = News()  # create a News object
    page = soup.find('div', {'id': 'zoom'})
    if not page.find('font', {'id': 'zoom_topic'}): return
    topic = page.find('font', {'id': 'zoom_topic'}).get_text()  # title
    news.topic = topic
    if not page.find('div', {'id': 'zoom_content'}): return
    main_content = page.find('div', {'id': 'zoom_content'})  # body text
    content = ''
    for p in main_content.select('p'):
        content = content + p.get_text()
    news.content = content
    news.url = url  # the article's URL
    f.write(news.topic + '\t' + news.content + '\n')

### DFS traversal of the whole site
def dfs(url):
    print(url)
    pattern1 = r'http://news\.ncu\.edu\.cn/[a-z_/.]*\.html$'  # pages worth following
    pattern2 = r'http://news\.ncu\.edu\.cn/html/2018/[0-9]-[0-9]{2}/[a-z0-9]{8}\.html$'  # article urls to parse
    # If the url has been visited already, return at once
    if url in visited: return
    # Record the url as visited
    visited.add(url)
    try:
        # Not visited yet, so parse the page
        html = request.urlopen(url).read().decode('utf-8', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        if re.match(pattern2, url):
            getNews(url)
        #### Extract every link on this page ####
        links = soup.findAll('a', href=re.compile(pattern1))
        for link in links:
            print(link['href'])
            if link['href'] not in visited:
                dfs(link['href'])
    except HTTPError as e:
        print(e)
        return
    except URLError as e:
        print(e)
        return

visited = set()  # stores the visited urls
f = open('C:/Users/lenovo/Desktop/news1.txt', 'a+', encoding='utf-8')
dfs('http://news.ncu.edu.cn/')
The crawl results are saved to the news1.txt text file on the desktop.
March 2, 2018
import numpy as np
import h5py
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
np.random.seed(1)
# GRADED FUNCTION: zero_pad
def zero_pad(X, pad):
    """
    Pad with zeros all images of the dataset X. The padding is applied to the height and width of an image,
    as illustrated in Figure 1.
    Argument:
    X -- python numpy array of shape (m, n_H, n_W, n_C) representing a batch of m images
    pad -- integer, amount of padding around each image on vertical and horizontal dimensions
    Returns:
    X_pad -- padded image of shape (m, n_H + 2*pad, n_W + 2*pad, n_C)
    """
    ### START CODE HERE ### (≈ 1 line)
    X_pad = np.pad(X, ((0, 0), (pad, pad), (pad, pad), (0, 0)), 'constant', constant_values=(0, 0))
    ### END CODE HERE ###
    return X_pad
np.random.seed(1)
x = np.random.randn(4, 3, 3, 2)
x_pad = zero_pad(x, 2)
print ("x.shape =", x.shape)
print ("x_pad.shape =", x_pad.shape)
print ("x[1,1] =", x[1,1])
print ("x_pad[1,1] =", x_pad[1,1])
fig, axarr = plt.subplots(1, 2)
axarr[0].set_title('x')
axarr[0].imshow(x[0,:,:,0])
axarr[1].set_title('x_pad')
axarr[1].imshow(x_pad[0,:,:,0])
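A quick shape check: with pad = 2 each spatial dimension grows from 3 to 3 + 2*2 = 7, so the prints above should report x.shape = (4, 3, 3, 2) and x_pad.shape = (4, 7, 7, 2), and the plot shows the original 3x3 slice beside the padded 7x7 slice.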
# GRADED FUNCTION: conv_single_step
def conv_single_step(a_slice_prev, W, b):
    """
    Apply one filter defined by parameters W on a single slice (a_slice_prev) of the output activation
    of the previous layer.
    Arguments:
    a_slice_prev -- slice of input data of shape (f, f, n_C_prev)
    W -- Weight parameters contained in a window - matrix of shape (f, f, n_C_prev)
    b -- Bias parameters contained in a window - matrix of shape (1, 1, 1)
    Returns:
    Z -- a scalar value, result of convolving the sliding window (W, b) on a slice x of the input data
    """
    ### START CODE HERE ### (≈ 2 lines of code)
    # Element-wise product between a_slice and W. Do not add the bias yet.
    s = a_slice_prev * W
    # Sum over all entries of the volume s.
    Z = np.sum(s)
    # Add bias b to Z. Cast b to a float() so that Z results in a scalar value.
    Z = Z + float(b)
    ### END CODE HERE ###
    return Z
np.random.seed(1)
a_slice_prev = np.random.randn(4, 4, 3)
W = np.random.randn(4, 4, 3)
b = np.random.randn(1, 1, 1)
Z = conv_single_step(a_slice_prev, W, b)
print("Z =", Z)
# GRADED FUNCTION: conv_forward
def conv_forward(A_prev, W, b, hparameters):
    """
    Implements the forward propagation for a convolution function
    Arguments:
    A_prev -- output activations of the previous layer, numpy array of shape (m, n_H_prev, n_W_prev, n_C_prev)
    W -- Weights, numpy array of shape (f, f, n_C_prev, n_C)
    b -- Biases, numpy array of shape (1, 1, 1, n_C)
    hparameters -- python dictionary containing "stride" and "pad"
    Returns:
    Z -- conv output, numpy array of shape (m, n_H, n_W, n_C)
    cache -- cache of values needed for the conv_backward() function
    """
    ### START CODE HERE ###
    # Retrieve dimensions from A_prev's shape (≈1 line)
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    # Retrieve dimensions from W's shape
    (f, f, n_C_prev, n_C) = W.shape
    # Retrieve information from "hparameters" (≈2 lines)
    stride = hparameters["stride"]
    pad = hparameters["pad"]
    # Compute the dimensions of the CONV output volume using the formula given above. Hint: use int() to floor. (≈2 lines)
    n_H = int((n_H_prev + 2*pad - f)/stride) + 1
    n_W = int((n_W_prev + 2*pad - f)/stride) + 1
    # Initialize the output volume Z with zeros. (≈1 line)
    Z = np.zeros((m, n_H, n_W, n_C))
    # Create A_prev_pad by padding A_prev
    A_prev_pad = zero_pad(A_prev, pad)
    for i in range(m):                       # loop over the batch of training examples
        a_prev_pad = A_prev_pad[i, :, :, :]  # Select ith training example's padded activation
        for h in range(n_H):                 # loop over vertical axis of the output volume
            for w in range(n_W):             # loop over horizontal axis of the output volume
                for c in range(n_C):         # loop over channels (= #filters) of the output volume
                    # Find the corners of the current "slice" (≈4 lines)
                    vert_start = h*stride
                    vert_end = h*stride + f
                    horiz_start = w*stride
                    horiz_end = w*stride + f
                    # Use the corners to define the (3D) slice of a_prev_pad (See Hint above the cell). (≈1 line)
                    a_slice_prev = a_prev_pad[vert_start:vert_end, horiz_start:horiz_end, :]
                    # Convolve the (3D) slice with the correct filter W and bias b, to get back one output neuron. (≈1 line)
                    Z[i, h, w, c] = conv_single_step(a_slice_prev, W[:, :, :, c], b[:, :, :, c])
    ### END CODE HERE ###
    # Making sure your output shape is correct
    assert(Z.shape == (m, n_H, n_W, n_C))
    # Save information in "cache" for the backprop
    cache = (A_prev, W, b, hparameters)
    return Z, cache
np.random.seed(1)
A_prev = np.random.randn(10,4,4,3)
W = np.random.randn(2,2,3,8)
b = np.random.randn(1,1,1,8)
hparameters = {"pad" : 2,
"stride": 2}
Z, cache_conv = conv_forward(A_prev, W, b, hparameters)
print("Z's mean =", np.mean(Z))
print("Z[3,2,1] =", Z[3,2,1])
print("cache_conv[0][1][2][3] =", cache_conv[0][1][2][3])
Run output:
March 1, 2018
1. Generate the dataset
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 1 19:49:07 2018
@author: lenovo
"""
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
from sklearn.linear_model import LogisticRegressionCV
# Display plots inline and change default figure size
matplotlib.rcParams['figure.figsize'] = (15.0, 10.0)
np.random.seed(0)
X, y = sklearn.datasets.make_moons(500, noise=0.20)
plt.scatter(X[:,0], X[:,1], s=60, c=y, cmap=plt.cm.Spectral)
2. Train a logistic regression classifier. It takes the x- and y-axis values as input and outputs the predicted class (0 or 1). (Here we use the logistic regression classifier from scikit-learn.)
# Train the logistic regression classifier
clf = sklearn.linear_model.LogisticRegressionCV()
clf.fit(X, y)
LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
fit_intercept=True, intercept_scaling=1.0, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
# Helper function to plot a decision boundary.
# If you don't fully understand this function don't worry, it just generates the contour plot below.
def plot_decision_boundary(pred_func):
    # Set min and max values and give it some padding
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the function value for the whole grid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
# Plot the decision boundary
plot_decision_boundary(lambda x: clf.predict(x))
plt.title("Logistic Regression")
3. Train a neural network. We build a three-layer network consisting of one input layer, one hidden layer, and one output layer. The number of nodes in the input layer is set by the dimensionality of the data, i.e. 2. Correspondingly, the number of nodes in the output layer is set by the number of classes, also 2. (We could predict 0 or 1 with a single output node and thus only two output classes; in practice, two output nodes make it easier to extend the network to more classes later.) The network takes the x, y coordinates as input and outputs two probabilities: one for class 0 (representing female) and one for class 1 (representing male). The result is shown below:
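In formulas (matching the variable names in the code below), the forward pass and the regularized cross-entropy loss are:

$$z_1 = x W_1 + b_1, \quad a_1 = \tanh(z_1), \quad z_2 = a_1 W_2 + b_2, \quad \hat{y} = \operatorname{softmax}(z_2)$$

$$L = \frac{1}{N}\left[-\sum_{n=1}^{N}\log \hat{y}_{n,\,y_n} + \frac{\lambda}{2}\left(\lVert W_1\rVert^2 + \lVert W_2\rVert^2\right)\right]$$

where N is num_examples, λ is reg_lambda, and ŷ(n, y_n) is the predicted probability of example n's true class, exactly as computed in calculate_loss.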
num_examples = len(X) # training set size
nn_input_dim = 2 # input layer dimensionality
nn_output_dim = 2 # output layer dimensionality
# Gradient descent parameters (I picked these by hand)
epsilon = 0.01 # learning rate for gradient descent
reg_lambda = 0.01 # regularization strength
# Helper function to evaluate the total loss on the dataset
def calculate_loss(model):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation to calculate our predictions
    z1 = X.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    # Calculating the loss
    correct_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(correct_logprobs)
    # Add regularization term to loss (optional)
    data_loss += reg_lambda/2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1./num_examples * data_loss
# Helper function to predict an output (0 or 1)
def predict(model, x):
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # Forward propagation
    z1 = x.dot(W1) + b1
    a1 = np.tanh(z1)
    z2 = a1.dot(W2) + b2
    exp_scores = np.exp(z2)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return np.argmax(probs, axis=1)
# This function learns parameters for the neural network and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - print_loss: If True, print the loss every 1000 iterations
def build_model(nn_hdim, num_passes=20000, print_loss=False):
    # Initialize the parameters to random values. We need to learn these.
    np.random.seed(0)
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))
    # This is what we return at the end
    model = {}
    # Gradient descent. For each batch...
    for i in range(0, num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # Backpropagation
        delta3 = probs
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)
        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1
        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2
        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}
        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, calculate_loss(model)))
    return model
# Build a model with a 3-dimensional hidden layer
model = build_model(3, print_loss=True)
# Plot the decision boundary
plot_decision_boundary(lambda x: predict(model, x))
plt.title("Decision Boundary for hidden layer size 3")
4. Varying the hidden-layer size (plots below)
plt.figure(figsize=(16, 32))
hidden_layer_dimensions = [1, 2, 3, 4, 5, 20, 50]
for i, nn_hdim in enumerate(hidden_layer_dimensions):
    plt.subplot(5, 2, i+1)
    plt.title('Hidden Layer size %d' % nn_hdim)
    model = build_model(nn_hdim)
    plot_decision_boundary(lambda x: predict(model, x))
plt.show()
March 1, 2018
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 20:19:41 2018
@author: lenovo
"""
import re
import random
import sys
import time
import datetime
import threading
from random import choice
import requests
import bs4
def get_ip():
    """Fetch proxy IPs."""
    url = "http://www.xicidaili.com/nn"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Referer": "http://www.xicidaili.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"
    }
    r = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    data = soup.table.find_all("td")
    ip_compile = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')  # match IPs
    port_compile = re.compile(r'<td>(\d+)</td>')               # match ports
    ip = re.findall(ip_compile, str(data))      # all IPs
    port = re.findall(port_compile, str(data))  # all ports
    return [":".join(i) for i in zip(ip, port)]  # combine as IP:port, e.g. 115.112.88.23:8080
# A list of user-agents; each request randomly picks one from this list
uas = [
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
"Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
]
def get_url(code=0, ips=[]):
    """
    Cast one vote.
    If the vote fails because the proxy IP is unusable,
    automatically switch to another proxy and vote again.
    """
    try:
        ip = choice(ips)
    except IndexError:
        return False
    else:
        proxies = {
            "http": ip,
        }
        headers2 = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
            "Referer": "https://best.zhaopin.com/",
            "User-Agent": choice(uas),
        }
        datas = {'bestid': 11174, 'source': 'best'}
        try:
            hz_url = "https://best.zhaopin.com/API/Vote.ashx"  # a voting site's endpoint (stand-in, not the real domain)
            hz_r = requests.post(hz_url, headers=headers2, data=datas, proxies=proxies)
        except requests.exceptions.ConnectionError:
            print('ConnectionError')
            if not ips:
                print('not ip')
                sys.exit()
            # Drop the unusable proxy IP
            if ip in ips:
                ips.remove(ip)
            # Retry the request
            get_url(code, ips)
        else:
            date = datetime.datetime.now().strftime('%H:%M:%S')
            print("code={0},date={1},ip={2},hz_r.text={3},len(ips)={4}".format(code, date, ip, hz_r.text, len(ips)))
ips = []
for i in range(6000):
    # Refresh the proxy list every 1000 votes; each refresh pulls
    # the latest batch of about 100 proxy IPs
    if i % 1000 == 0:
        ips.extend(get_ip())
    # Spawn one voting thread per second; shorten the sleep to vote
    # faster (time.sleep takes seconds, fractional values allowed)
    t1 = threading.Thread(target=get_url, args=(i, ips))
    t1.start()
    time.sleep(1)
Original post: https://www.cnblogs.com/zhouxinfei/p/7861966.html
February 23, 2018
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 23 12:31:27 2018
@author: lenovo
"""
from splinter.browser import Browser
from time import sleep
import traceback
class Buy_Tickets(object):
    # Define and initialize the instance attributes
    def __init__(self, username, passwd, order, passengers, dtime, starts, ends):
        self.username = username
        self.passwd = passwd
        # Train index: 0 means try every train from top to bottom;
        # 1 means the first train in the list, and so on
        self.order = order
        # Passenger names
        self.passengers = passengers
        # Origin and destination
        self.starts = starts
        self.ends = ends
        # Date
        self.dtime = dtime
        # Berth preference: hard sleeper
        self.YW = ['硬卧']
        self.login_url = 'https://kyfw.12306.cn/otn/login/init'
        self.initMy_url = 'https://kyfw.12306.cn/otn/index/initMy12306'
        self.ticket_url = 'https://kyfw.12306.cn/otn/leftTicket/init'
        self.driver_name = 'chrome'
        self.executable_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    # Log in
    def login(self):
        self.driver.visit(self.login_url)
        self.driver.fill('loginUserDTO.user_name', self.username)
        self.driver.fill('userDTO.password', self.passwd)
        print('Please enter the captcha by hand...')
        # Wait until login succeeds and we land on the "My 12306" page
        while True:
            if self.driver.url != self.initMy_url:
                sleep(1)
            else:
                break
    # Buy the tickets
    def start_buy(self):
        self.driver = Browser(driver_name=self.driver_name, executable_path=self.executable_path)
        # Set the window size
        self.driver.driver.set_window_size(700, 500)
        self.login()
        self.driver.visit(self.ticket_url)
        try:
            print('Start buying...')
            # Preload the query parameters via cookies
            self.driver.cookies.add({"_jc_save_fromStation": self.starts})
            self.driver.cookies.add({"_jc_save_toStation": self.ends})
            self.driver.cookies.add({"_jc_save_fromDate": self.dtime})
            self.driver.reload()
            count = 0
            if self.order != 0:
                while self.driver.url == self.ticket_url:
                    self.driver.find_by_text('查询').click()  # the "Query" button
                    count += 1
                    print('Query click #%d...' % count)
                    try:
                        self.driver.find_by_text('预订')[self.order - 1].click()  # the "Book" button
                        sleep(1.5)
                    except Exception as e:
                        print(e)
                        print('Booking failed...')
                        continue
            else:
                while self.driver.url == self.ticket_url:
                    self.driver.find_by_text('查询').click()  # the "Query" button
                    count += 1
                    print('Query click #%d...' % count)
                    try:
                        for i in self.driver.find_by_text('预订'):  # try every "Book" button
                            i.click()
                            sleep(1)
                    except Exception as e:
                        print(e)
                        print('Booking failed...')
                        continue
            print('Start booking...')
            sleep(1)
            print('Selecting passengers...')
            for p in self.passengers:
                self.driver.find_by_text(p).last.click()
                sleep(0.5)
                # Student tickets pop up an extra confirmation dialog
                if p[-1] == ')':
                    self.driver.find_by_id('dialog_xsertcj_ok').click()
            print('Submitting the order...')
            sleep(1)
            self.driver.find_by_text(self.YW[0]).click()  # pick the berth type (hard sleeper)
            sleep(1)
            self.driver.find_by_id('submitOrder_id').click()
            sleep(2)
            print('Confirming the seats...')
            self.driver.find_by_id('qr_submit_id').click()
            print('Booking succeeded...')
        except Exception as e:
            print(e)
if __name__ == '__main__':
    # Username
    username = 'LVXIONG06'
    # Password
    password = '******'
    # Train choice: 0 means try every train
    order = 0
    # Passenger names, e.g. passengers = ['吕雄']
    # Student tickets must be marked as such: passengers = ['吕雄(学生)']
    passengers = ['吕雄(学生)']
    # Date, in the form '2018-03-10'
    dtime = '2018-03-10'
    # Origin (fill in the cookie value)
    starts = '%u5BA3%u5A01%2CXWM'  # 宣威 (Xuanwei)
    # Destination (fill in the cookie value)
    ends = '%u5357%u660C%2CNCG'  # 南昌 (Nanchang)
    Buy_Tickets(username, password, order, passengers, dtime, starts, ends).start_buy()
- WZ 无座 (no seat, standing)
- YZ 硬座 (hard seat)
- RZ 软座 (soft seat)
- YW 硬卧 (hard sleeper)
- SRRB 动卧 (EMU sleeper)
- RW 软卧 (soft sleeper)
- GR 高软 (deluxe soft sleeper)
- ZE 二等座 (second-class seat)
- ZY 一等座 (first-class seat)
- TZ 商务座 (business-class seat)
- %u5317%u4EAC%2CBJP: 北京 (Beijing)
- %u5BA3%u5A01%2CXWM: 宣威 (Xuanwei)
- %u6606%u660E%2CKMM: 昆明 (Kunming)
- %u5357%u660C%2CNCG: 南昌 (Nanchang)
- value="1"成人票
- value="2"儿童票
- value="3"学生票
- value="4"残军票
##
- To find the cookie values for the origin and destination stations:
- run javascript:alert(document.cookie) in the browser console
##
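The cookie values above use the non-standard %uXXXX escape. A small Python helper (my sketch, not part of the original script) can decode them to verify the station names:

import re

def decode_u(s):
    # Decode the non-standard %uXXXX escapes used in the 12306 cookies,
    # then the ordinary %2C escape (a comma)
    s = re.sub(r'%u([0-9a-fA-F]{4})', lambda m: chr(int(m.group(1), 16)), s)
    return s.replace('%2C', ',')

print(decode_u('%u5BA3%u5A01%2CXWM'))  # -> 宣威,XWM
print(decode_u('%u5357%u660C%2CNCG'))  # -> 南昌,NCG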
Original post: http://www.itongji.cn/cms/article/articledetails?articleid=6845
February 23, 2018
I often hit rough patches in natural language processing, a love-hate affair, so I wanted to try something different: tinker around, imitate a working recipe, and bang out some code to get a feel for things. Shelving my work in C, I turned to text mining in R, curious what sparks would fly when classical poetry runs into program code.
Process:
1. Retrieve the Qianjiashi (《千家诗》) anthology, save it to the local desktop, and encode it as ANSI.
2. Use Han Hong's 《同题仙游观》 as the template for word substitution.

Approach: first segment the entire anthology by part of speech to build a lexicon; then, given the template poem, pick one-, two-, or three-character words with TF < 50 from the lexicon and substitute them in wherever their part of speech matches.

Shortcomings:
1. The "classical poems" that R assembles care only about matching parts of speech; they have no meter, let alone literary merit, though some of the generated phrases are rather interesting (see the output below). Next time I plan to fold tonal patterns and rhyme into the code.
2. After segmentation, words are ranked purely by term frequency (TF), so the selection is not entirely reasonable; later I will try TF-IDF (term frequency times inverse document frequency, defined below).
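For reference, the TF-IDF weighting mentioned in point 2 is commonly defined as:

$$\text{tf-idf}(t, d) = \mathrm{tf}(t, d) \times \log \frac{N}{\mathrm{df}(t)}$$

where tf(t, d) is the frequency of term t in document d, N is the number of documents, and df(t) is the number of documents containing t.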
# Writing poetry with code is not my original idea: back in 2016 Tsinghua University unveiled Weiwei (薇薇), a poetry-writing robot claimed to be able to pass the Turing test # https://zhuanlan.zhihu.com/p/25446637 #
The code is as follows:
fileName <- "C:\\Users\\lenovo\\Desktop\\千家诗.txt"
SC <- readChar(fileName, file.info(fileName)$size)
substr(SC, 1000, 1100)
library(jiebaR)
cc = worker()                              # jiebaR segmenter
analysis <- as.data.frame(table(cc[SC]))   # term-frequency table
names(analysis) <- c("word", "freq")       # rename columns before sorting on freq
analysis <- analysis[order(-analysis$freq), ]
analysis$word <- as.character(analysis$word)
head(analysis)
library(wordcloud2)
wordcloud2(analysis)
# Word clouds of words with 1 < TF < 50, split by word length
wordcloud2(analysis[analysis$freq > 1 & analysis$freq < 50 & nchar(analysis$word) == 1, ])
wordcloud2(analysis[analysis$freq > 1 & analysis$freq < 50 & nchar(analysis$word) == 2, ])
wordcloud2(analysis[analysis$freq > 1 & analysis$freq < 50 & nchar(analysis$word) == 3, ])
gushi <- "仙台初见五城楼,风物凄凄宿雨收。山色遥连秦树晚,砧声近报汉宫秋。疏松影落空坛静,细草春香小洞幽。何用别寻方外去,人间亦自有丹邱。"
tagger <- worker("tag")                    # POS-tagging segmenter
gushi_2 <- tagger <= gushi                 # segment the template poem with POS tags
gushi_2
example <- subset(analysis, freq > 1 & nchar(word) < 4 & freq < 50)
cixing <- attributes(gushi_2)$names        # POS sequence of the template
example_2 <- tagger <= example$word        # POS-tag the candidate words
write_gushi <- function(m){
  set.seed(m)
  empty <- ""
  # For each slot in the template, sample a candidate word with the
  # same part of speech and the same character length
  for (i in 1:length(gushi_2)){
    temp_file <- example_2[attributes(example_2)$names == cixing[i]]
    temp_file <- temp_file[nchar(temp_file) == nchar(gushi_2[i])]
    empty <- paste0(empty, sample(temp_file, 1))
  }
  result <- paste0(substr(empty, 1, 7), ",", substr(empty, 8, 14), "。",
                   substr(empty, 15, 21), ",", substr(empty, 22, 28), "。",
                   substr(empty, 29, 35), ",", substr(empty, 36, 42), "。",
                   substr(empty, 43, 49), ",", substr(empty, 50, 56), "。")
  result
}
lapply(1:6, write_gushi)
[[1]]
[1] "海气扶持日幽州,俸钱寻常朱庆余。世事送春歌古调,歌韵风急宋之问。老大投明镜垣遵,入云峰押元韵圆。对此他何所似覆,居处暗沽酒云金。"
[[2]]
[1] "风色停船日飞上,院落有感花里逢。匈奴梅柳新雨后,秋光古木望明月。繁华生烟霞樽宫,太乙近共沾巾士。何用吾花想容昏,门户甫风吹舟翠。"
[[3]]
[1] "飞上相识两魂魄,山房凄凄何所似。红叶散入押翰韵,齐韵偏惊孟浩然。平明影姓名际翁,折露葵卢梅坡蜂。不曾别归思欲扶,先帝未早知南晓。"
[[4]]
[1] "人烟安得九行宫,斜阳高适押沁韵。幽人蒸韵共沾巾,香雾花迎戴复古。有感生蝼蚁子阙,竹里馆押御韵水。自是任无一字赏,兄弟尚相与美秦。"
[[5]]
[1] "烽火送君余行人,村庄清明夏木啭。小园之四海日生,横北难复黄庭坚。平明辅俸钱身苑,新雨后欲傍衮饮。向晚各剑佩星倒,满地岂至尊丹朱。"
[[6]]
[1] "山路掌中九旧事,小姑自足竹里馆。无情齐韵歌古调,啼时云想戴复古。孤高投归路蛙王,花想容押麻韵照。照眼别江春入停,画屏绝造化鹤灵。"
Written on February 22, 2018
Passing readers, please don't hesitate to correct the code \(≧▽≦)/ A thousand thanks!
February 22, 2018
The Doctrine of the Mean (《中庸》) says: "What Heaven decrees is called nature; to follow that nature is called the Way." The "Heaven" (天) spoken of here is the highest source of nature, a rational concept that sums up natural creation as a whole; as a metaphysical postulate, this Confucian "Heaven" agrees with the "Dao" (道) of Laozi and Zhuangzi. Confucius's "Heaven" is marked by his refusal to speak of prodigies, feats of strength, disorder, or spirits; Laozi's "Dao" by "the Dao that can be told is not the constant Dao" and "I do not know its name; forced to name it, I call it Dao." Both avoid talk of ghosts and spirits and give weight to the discourse of Heaven's decree, treating mystical speculation and illusory appearance as things to be posited but not probed, noted but not discussed. Confucianism lets the metaphysical "Heaven" endow humanity with an ethical and moral "nature," and takes the "decree" (命) within the laws of Heaven's operation as the mechanism by which "Heaven" and "nature" act on each other; this "decree" describes a process of operation, with nothing mysterious or fatalistic about it. Hence: what Heaven decrees is called nature, and to follow that nature is called the Way. By contrast, the Daodejing's "The Dao is empty, yet in use it is never exhausted" expresses Laozi's idea that Heaven's decree is natural and human nature practices non-action (wuwei), giving "Heaven's decree" more weight than "human nature."
Breaking away from the Confucian and Daoist ideas of "Heaven" and "Dao," Buddhism takes "mystical thought and illusory appearance" as what is conferred upon human nature, and from this derives its doctrine of "awakening" (觉悟). In effect it distinguishes itself from Confucianism and Daoism by what it applies "human nature" to, and the doctrine of awakening comes closer to a realistic account of Heaven's decree and human nature. The Heart Sutra (般若波罗蜜多心经) has the verse: "There is no suffering, no origination, no cessation, no path; no wisdom and no attainment, because there is nothing to be attained." This expounds the Four Noble Truths between "Heaven's decree" and "human nature," suffering (苦), origination (集), cessation (灭), and the path (道), as the course along which human nature develops. It stresses the "suffering" that Heaven's decree imposes on human nature (the eight sufferings: birth, aging, sickness, death, meeting what one hates, parting from what one loves, not getting what one seeks, and the five clinging aggregates), while human nature, by "believing firmly in the true, seeking the good tenaciously, and loving the beautiful generously," reaches the fullest and most real state of nirvana, thereby attaining the natural liberation of "no self" and "no things": all conditioned things are impermanent, all dharmas are without self, and nirvana is perfect quiescence. Between these poles, "good" and "evil" take shape as relative categories of conduct; "good" is the ground of liberation and the source of awakening. Hence the mechanism operating between Heaven's decree and human nature is a process of transformation between good and evil rooted in "suffering, origination, cessation, path" and "feeling, perception, volition, consciousness."
"To take as the Way what one calls the Way is not what I call the Way; to take as virtue what one calls virtue is not what I call virtue." Buddhism, Daoism, and Confucianism each hold distinctive views on Heaven's decree and human nature, yet all basically observe the relativization of good and evil between the two. Mencius says: "To take goodness from others and practice it is to join others in doing good; for the noble person nothing is greater than joining others in doing good." The Daodejing says: "The sage is always good at saving people, so no one is abandoned; always good at saving things, so nothing is wasted." The good are of three kinds: those good because they see principle clearly, those good because they cherish their name and integrity, and those good because they fear the majesty of the law. From this standpoint, the effect of Heaven's decree on human nature is shaped by human conduct; it is not, as fate-reckoning (命理学) claims, that fate is fixed by Heaven, that Heaven's decree controls human nature, and that human nature cannot alter Heaven's decree. All three teachings hold that the effect of Heaven's decree on human nature is shaped by human conduct, though they differ on how great that influence is; among them, the Buddhist doctrine of awakening comes closest to a realistic account. The Diamond Sutra (《金刚经》) has the verse: "All conditioned dharmas are like dreams, illusions, bubbles, shadows; like dew, and like lightning: thus should one contemplate them." Buddhism's requital of good and evil and its cycle of cause and effect treat the individual as an enduring continuum that merely appears in a different identity at each stage; to leap out of this cycle one must realize that "the five aggregates are all empty" and so change one's structure of perception. This is the main reason why "the good are not always treated well, and the evil are not always treated ill": one who constantly does good may, in a given stage of the cycle of Heaven's decree (what Buddhism calls one lifetime), receive no fitting reward, and one who constantly does evil no fitting retribution; the good person's reward often arrives in another stage (another lifetime). Hence the Buddha's oft-recited mantra: "gate, gate" ("gone, gone").
Written May 19, 2016, in my Qzone (QQ空间)
Excerpted February 20, 2018