##使用pandas对价格进行判断
playground/pds.py
使用fiveNumber, 即箱线图的方式来区分异常价格
from pymongo import MongoClient
from bson.objectid import ObjectId
import pandas as pd
# setUp
prods = MongoClient('192.168.1.202')['dev'].products
query = prods.find_one({'_id':ObjectId("53bce82e8ddf879f99703ee9")})
prices = query['_unitPrice']
def find_limits(describe, n=1.5):
    """Return the (lower, upper) box-plot fences used to flag outliers.

    describe: mapping indexed by '25%' and '75%' (the script passes the
        result of ``Series.describe()``).
    n: multiple of the interquartile range allowed beyond each quartile.

    Returns ``(lower_limit, upper_limit)``, i.e.
    ``(Q1 - n * IQR, Q3 + n * IQR)``.
    """
    q1 = describe['25%']
    q3 = describe['75%']
    # Interquartile range scaled by n gives the distance of each fence
    # from its quartile.
    delta = n * (q3 - q1)
    return q1 - delta, q3 + delta
#print(prices)
frame = pd.DataFrame(prices)
frame = frame.T
frame.columns = ['weight', 'price']
frame.price = frame.price.astype('int32')
de = frame.price.describe()
l, u = find_limits(de)
print(frame[ frame.price < l ])
print(frame[ frame.price > u ])
##nlp ###calculate the coverage of words from wordmaker 只有28.0%, 许多词太长了需要截短才能找到..
###src/caigen/nlp/commands/process_reviews.py
from rq import Queue, Connection, Worker


def worker():
    """Start an rq worker on the 'review_nlp' queue.

    A ``Worker`` is created for ``Queue('review_nlp')`` inside a
    ``Connection(Redis())`` context and its ``work()`` method is called.
    """
    # NOTE(review): Redis is not imported in this snippet; presumably
    # `from redis import Redis` is needed -- confirm against the real module.
    with Connection(Redis()):
        w = Worker(Queue('review_nlp'))
        w.work()
##css
##flatten
#yield from.. see PEP380
def walker(d):
    """Yield every key of a nested dict, depth-first.

    Each key is yielded before the keys found inside its value. Values
    that are not dicts contribute nothing, and a non-dict argument yields
    nothing at all.
    """
    if not isinstance(d, dict):
        return
    for key, value in d.items():
        yield key
        # Delegate to the sub-generator for nested dicts (PEP 380).
        yield from walker(value)
d = {'a':1, 'b': {'c': 2, 'd': 'c'}}
#flatten nested list.
def flat(nested):
    """Yield the leaf elements of an arbitrarily nested iterable.

    nested: any object. Non-iterable objects, and ``str``/``bytes``
        values, are treated as leaves and yielded unchanged.

    Strings are not descended into: iterating a one-character string
    yields that same string again, so recursing into it would never end.
    """
    if isinstance(nested, (str, bytes)):
        yield nested
        return
    try:
        elements = iter(nested)
    except TypeError:
        # Not iterable: this is a leaf.
        yield nested
        return
    for ele in elements:
        # Recurse into this function itself.
        yield from flat(ele)
##read PEP8 again link
##Redis redis is an in-memory remote database that offers high performance, replication, and a unique data model to produce a platform for solving problems.
supporting five different types of data structures
#types
STRING
set hello world
get hello
LIST
rpush list-key item1
rpush list-key item2
lrange list-key 0 -1
lindex list-key 1
lpop list-key
lrange list-key 0 -1
SET
sadd set-key item
sadd set-key item2
sadd set-key item3
sadd set-key item //return (integer) 0
smembers set-key
sismember set-key item4
sismember set-key item
srem set-key item2
smembers set-key
HASH
hset hash-key sub-key1 value1
hset hash-key sub-key2 value2
hset hash-key sub-key1 value1
hgetall hash-key
hdel hash-key sub-key2
hdel hash-key sub-key2
hget hash-key sub-key1
hgetall hash-key
ZSET
zadd zset-key 728 member1
zadd zset-key 982 member0
zadd zset-key 982 member0
zrange zset-key 0 -1 withscores
zrangebyscore zset-key 0 800 withscores
zrem zset-key member1
zrem zset-key member1
zrange zset-key 0 -1 withscores
##selenium ###simple usage
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get("http://www.python.org")
assert "Python" in driver.title
elem = driver.find_element_by_name("q")
elem.send_keys("selenium")
elem.send_keys(Keys.RETURN)
driver.close()
###write tests
import unittest
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class PythonOrgSearch(unittest.TestCase):
    """Browser test that submits a search on python.org through Firefox."""

    def setUp(self):
        # Each test gets its own Firefox driver.
        self.driver = webdriver.Firefox()

    def test_search_in_python_org(self):
        """Load python.org, check the title, and submit 'selenium' in field 'q'."""
        driver = self.driver
        driver.get("http://www.python.org")
        self.assertIn("Python", driver.title)
        elem = driver.find_element_by_name("q")
        elem.send_keys("selenium")
        elem.send_keys(Keys.RETURN)

    def tearDown(self):
        # quit() ends the whole driver session, not just the current window,
        # so no browser/driver process is left behind after each test.
        self.driver.quit()
if __name__ == "__main__":
unittest.main()
###xpath
element = browser.find_element_by_xpath("//select[@name='name']")
element = browser.find_element_by_xpath("//input[@id='kw1']")
If more than one element matches the query, only
the first is returned. If nothing can be found, a NoSuchElementException
is raised.
###interacting with page
You can easily clear the contents of a text field or textarea with the clear
method:
element.clear()
###filling in forms
toggle
the state of drop down
use setSelected
to set something like an OPTION tag selected
##example
element = driver.find_element_by_xpath("//select[@name='name']")
all_options = element.find_elements_by_tag_name("option")
for option in all_options:
print("Value is: %s" % option.get_attribute("value"))
option.click()
from selenium.webdriver.support.ui import Select
select = Select(driver.find_element_by_name('name'))
select.select_by_index(index)
select.select_by_visible_text("text")
select.select_by_value(value)
select = Select(driver.find_element_by_id('id'))
select.deselect_all()
###drag and drop
element = driver.find_element_by_name("source")
target = driver.find_element_by_name("target")
from selenium.webdriver import ActionChains
action_chains = ActionChains(driver)
# ActionChains only queues actions; nothing happens in the browser until
# perform() is called.
action_chains.drag_and_drop(element, target).perform()
###moving between windows and frames
driver.switch_to_window("windowName")
###notice
If your page uses a lot of AJAX on load, then WebDriver may not know
when it has completely loaded. If you need to ensure such pages
are fully loaded, then you can use waits.
##worldcup
from pyquery import PyQuery as pq
perfix = 'http://www.fifa.com'
def fetchStatistic(link, meta):
    """Scrape one match-statistics page and print the collected record.

    link: URL passed to ``pq`` to load the statistics page.
    meta: dict of match metadata; its items are copied into the record.

    The record maps each statistic name to ``{'home': ..., 'away': ...}``.
    Nothing is returned; the record is only printed.
    """
    rec = {}
    rec.update(meta)
    d = pq(link)
    generic = pq(d('#generalStatsTopContainer').html())
    stats = ['attacks', 'shots', 'delivery_penalty_area',
             'clearances_attempted', 'passes_completed']
    for stat in stats:
        # Only the first space-separated token of each cell's text is kept.
        home = generic('[data-codename={}] [data-statref=home]'.format(stat)).text().split(' ')[0]
        away = generic('[data-codename={}] [data-statref=away]'.format(stat)).text().split(' ')[0]
        rec[stat] = {'home': home, 'away': away}

    # fouls
    fouls = d('#disciplinary .chart-container-donut-doubleside')
    # Presumably the home/away foul totals shown beside the chart -- confirm
    # against the page. Stored so the values are not read and then dropped.
    rec['fouls'] = {'home': fouls.children('.chart-leftlabel').text(),
                    'away': fouls.children('.chart-rightlabel').text()}

    foulStat = d('#disciplinary .table.statistics-block tr')
    for i, _ in enumerate(foulStat):
        parts = foulStat.eq(i).text().split(' ')
        if len(parts) < 2:
            # An empty or one-token row cannot be split into home/away values;
            # unpacking it would raise ValueError.
            continue
        # First token is the home value, last is the away value, and
        # whatever lies between them is the statistic's name.
        h, *fc, a = parts
        rec[' '.join(fc)] = {'home': h, 'away': a}
    print(rec)
    # TODO insert into mongodb
def main():
    """Read the match index page and fetch statistics for the first result.

    For a match entry the teams, score, date and round are extracted,
    printed, and passed as metadata to ``fetchStatistic`` together with the
    statistics URL derived from the entry's link.
    """
    index = pq('http://www.fifa.com/worldcup/matches/index.html')
    matches = index('.match-list-date .mu.result')
    for i, _ in enumerate(matches):
        # Re-parse the entry's inner HTML so the selectors below only see
        # this match. A separate name avoids shadowing the index page.
        match = pq(matches.eq(i).html())
        href = match('a').attr('href')
        if not href:
            # No link means no statistics page can be derived; skip the entry.
            continue
        date = match('.mu-i .mu-i-date').text()
        rund = match('.mu-i .mu-i-group').text()
        home = match('.mu-m .t.home .t-nText').text()
        away = match('.mu-m .t.away .t-nText').text()
        # partition() tolerates a score text without '-' (the away score is
        # then ''), where indexing the result of split('-') would raise.
        homeScore, _sep, awayScore = match('.mu-m .s div.s-score').text().partition('-')
        statisticLink = href.replace('index', 'statistics')
        print(home, homeScore, away, awayScore)
        meta = {'game': {'home': home, 'away': away},
                'score': {'home': homeScore, 'away': awayScore},
                'date': date,
                'round': rund}
        fetchStatistic(perfix + statisticLink, meta)
        # NOTE(review): only the first usable match is processed; this break
        # looks like a debugging limit -- confirm before removing it.
        break
if __name__ == "__main__":
#fetchStatistic('aa')
main()
##world cup player statistics
http://www.fifa.com/worldcup/matches/
and the PDFs under each page
##pdb commands
- h(elp)
- help command
- c(ontinue)
- q(uit)
- b(reak) number
- b path/to/file.py:number: set a breakpoint at the given line number in a specific file
- s(tep)
- n(ext)
- u(p) / d(own)
- a(rgs)
- debug statement
- l(ist) statement
- w(here)
##mongo search
$where
db.product.find(
{"$where" :function(){
for (var cur in this) {
for (var other in this) {
if (cur != other && this[cur] ==this[other])
return true;
}
}
return false
}})