# 🐍 Building a Web Scraper with Python (nomad coders)
A Python course I'm taking for fun.. 🙂
When I took the course in April 2022, the markup of the Indeed site
had changed a bit, and the Stack Overflow jobs site could no longer be found,, so instead I'm going to parse the https://stackoverflow.com/jobs/companies
page.
The code here may therefore differ from the code written in the lectures!
First I watched the lectures and scraped HTML the way it was taught. As practice, I used scraping to pull the lecture titles, as below!
beautiful soup
seems useful. I should put it to good use.
🔸 py
import requests
from bs4 import BeautifulSoup
result = requests.get("https://nomadcoders.co/python-for-beginners/lobby")
soup = BeautifulSoup(result.text, 'html.parser')
titles = soup.find_all("span", {"class": "px-6 py-4 whitespace-nowrap text-sm leading-5 overflow-hidden font-medium flex items-center text-gray-400"})
for title in titles:
    print(title.text)
🔹 console
#0.5 How to Ask for Help (02:00)
#0.6 Code Python Online (03:08)
#1.0 Data Types of Python (08:48)
#1.1 Lists in Python (08:30)
#1.2 Tuples and Dicts (06:33)
...
(snip)
...
#4.6 Rendering Jobs! (12:24)
#4.7 Export Route (08:48)
#4.8 File Download (05:21)
#4.9 Recap (07:28)
#4.10 Conclusions (02:56)
# ⚡ 1. THEORY
You can test things quickly on repl.it.
# 1.0 Data Types of Python
🔸 py
# Python variable names are written in snake_case.
a_string = 'like this'
a_number = 3
a_float = 3.12
a_boolean = False # in Python, booleans start with a capital letter.
a_none = None # represents the absence of a value.
# print the types
print(type(a_number))
print(type(a_string))
print(type(a_none))
🔹 console
<class 'int'>
<class 'str'>
<class 'NoneType'>
# 1.1 Lists in Python
🔸 py
# a list is a mutable sequence
days = ["Mon", "tue", "Wed", "Thur", "Fri"]
print("------list------")
print(days)
print("Mon" in days)
print("Man" in days)
print(days[3])
print(type(days))
print("------append------")
days.append("Sat")
print(days)
print("------reverse------")
days.reverse()
print(days)
🔹 console
------list------
['Mon', 'tue', 'Wed', 'Thur', 'Fri']
True
False
Thur
<class 'list'>
------append------
['Mon', 'tue', 'Wed', 'Thur', 'Fri', 'Sat']
------reverse------
['Sat', 'Fri', 'Thur', 'Wed', 'tue', 'Mon']
# 1.2 Tuples and Dicts
🔸 py
# a tuple is an immutable sequence
days = ("Mon", "tue", "Wed", "Thur", "Fri")
print("------tuple------")
print(days)
print(type(days))
print("------dictionary------")
nico = {
    "name": "Nico",
    "age": 29,
    "korean": True,
    "fav_food": ["Kimchi", "Sashimi"]
}
print(type(nico))
print(nico)
nico["handsome"] = True
print(nico)
🔹 console
------tuple------
('Mon', 'tue', 'Wed', 'Thur', 'Fri')
<class 'tuple'>
------dictionary------
<class 'dict'>
{'name': 'Nico', 'age': 29, 'korean': True, 'fav_food': ['Kimchi', 'Sashimi']}
{'name': 'Nico', 'age': 29, 'korean': True, 'fav_food': ['Kimchi', 'Sashimi'], 'handsome': True}
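The immutability is easy to check for yourself — item assignment fails on a tuple but works on a list. A quick sketch (not from the lecture):
🔸 py
days_tuple = ("Mon", "tue", "Wed")
days_list = ["Mon", "tue", "Wed"]
days_list[0] = "MON"       # fine: lists are mutable
try:
    days_tuple[0] = "MON"  # tuples are immutable, so this raises
except TypeError as e:
    print(e)               # 'tuple' object does not support item assignment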
# 1.3 Built in Functions
Python ships with a variety of built-in functions.
🔸 py
print("------len------")
print(len("fjsiodjfoisjf"))
print("------type------")
age = "18"
print(type(age))
n_age = int(age)
print(type(n_age))
🔹 console
------len------
13
------type------
<class 'str'>
<class 'int'>
# 1.4 Creating Your First Python Function
🔸 py
# the body must be indented to belong to the function!
# Python does not delimit blocks with {}.
def say_hello():
    print("hello")
    print("bye")
say_hello()
say_hello()
say_hello()
🔹 console
hello
bye
hello
bye
hello
bye
# 1.5 Function Arguments
🔸 py
def say_hello(who):
    print("hello", who)
say_hello("Nico")
say_hello(True)
# say_hello() - error!
print("------")
def plus(a, b):
    print(a + b)
def minus(a, b = 0): # default value
    print(a - b)
plus(2, 5)
minus(2)
minus(2, 5)
🔹 console
hello Nico
hello True
------
7
2
-3
# 1.6 Returns
🔸 py
def p_plus(a, b):
    print(a + b)
def r_plus(a, b):
    return a + b
    print("test") # never runs: it comes after the return
p_result = p_plus(2, 3)
r_result = r_plus(2, 3)
print(p_result, r_result)
🔹 console
5
None 5
# 1.7 Keyworded Arguments
🔸 py
# format string
def say_hello(name, age):
    return f"Hello {name} you are {age} years old"
# passing arguments by keyword is fine — the order doesn't matter!
hello = say_hello(age = "12", name = "Nico")
print(hello)
🔹 console
Hello Nico you are 12 years old
# 1.8 Code Challenge!
Build a calculator with seven operations.
🔸 py
# the seven operations
def plus(a, b):
    return calc(a, b, '+')
def minus(a, b):
    return calc(a, b, '-')
def times(a, b):
    return calc(a, b, '*')
def division(a, b):
    return calc(a, b, '/')
def nega(a, b):
    return calc(a, b, 'nega')
def remain(a, b):
    return calc(a, b, '%')
def power(a, b):
    return calc(a, b, '**')
# validate and calculate
def calc(a, b, func):
    try:
        a = float(a)
        b = float(b)
    except:
        return 'Please enter a number.'
    if func == '+':
        return a + b
    elif func == '-':
        return a - b
    elif func == '*':
        return a * b
    elif func == '/':
        return a / b
    elif func == '%':
        return a % b
    elif func == 'nega':
        return -a
    elif func == '**':
        return a ** b
    else:
        return 'Please check a method name.'
print("------print(plus(3,'test'))------")
print(plus(3,'test'))
print("------print(minus(3,5))------")
print(minus(3,5))
🔹 console
------print(plus(3,'test'))------
Please enter a number.
------print(minus(3,5))------
-2.0
# 1.9 Conditionals Part One
🔸 py
def plus(a, b):
    if type(b) is int or type(b) is float:
        return a + b
    else:
        return None
print(plus(12, 1.2))
print(plus(12, "test"))
🔹 console
13.2
None
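As an aside (not from the lecture), isinstance is the more idiomatic way to write the same type check, and it accepts a tuple of types:
🔸 py
def plus(a, b):
    # equivalent to: type(b) is int or type(b) is float
    if isinstance(b, (int, float)):
        return a + b
    else:
        return None
print(plus(12, 1.2))     # 13.2
print(plus(12, "test"))  # None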
# 1.10 if else and or
🔸 py
def age_check(age):
    print(f"------you are {age}------")
    if age < 18:
        print("you cant drink")
    elif age == 18:
        print("you are new to this!")
    elif age > 20 and age < 25:
        print("you are still kind of young")
    else:
        print("enjoy your drink")
    print()
age_check(16)
age_check(18)
age_check(23)
age_check(30)
🔹 console
------you are 16------
you cant drink
------you are 18------
you are new to this!
------you are 23------
you are still kind of young
------you are 30------
enjoy your drink
# 1.11 for in
🔸 py
days = ("Mon", "Tue", "Wed", "Thur", "Fri")
for d in days:
    if d == 'Wed':
        break
    else:
        print(d)
for n in [1, 2, 3, 4, 5]:
    print(n)
for letter in "nicolas":
    print(letter)
🔹 console
Mon
Tue
1
2
3
4
5
n
i
c
o
l
a
s
# 1.12 Modules
🔸 main.py
# only import what you actually use!
from math import ceil, fsum as sexy_sum
from calculator import plus
print(ceil(1.2))
print(sexy_sum([1, 2, 3, 4, 5, 6, 7]))
print(plus(2, 3))
🔸 calculator.py
def plus(a, b):
    return a + b
🔹 console
2
28.0
5
# ⚡ 2. BUILDING A JOB SCRAPPER
# 2.0~1 What is Web Scrapping~What are We Building
We're going to build a feature that scrapes sites with Python -
since a Korean version of Indeed now exists and can be confusing, we'll scrape this url directly: https://www.indeed.com/jobs?q=python&limit=50
The stackOverflow jobs page seems to be gone,, let's test whether it was replaced by a different page.
# 2.2 Navigating with Python
👉 First, let's fetch the HTML.
We'll use python requests
https://docs.python-requests.org/en/latest/
to fire off HTTP requests.
On repl.it, install the package: search for requests and install "Python HTTP for Humans".
r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
r.status_code              # 200
r.headers['content-type']  # 'application/json; charset=utf8'
r.encoding                 # 'utf-8'
r.text                     # '{"type":"User"...'
r.json()                   # {'private_gists': 419, 'total_private_repos': 77, ...}
🔸 py
import requests
indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")
print(indeed_result.text)
🔹 console
The HTML text is printed.
.
.
.
k\u0004My new jobs":[null,""],"job feed link\u0004There are new jobs":[null,""],"jobresults_tip_l_empty_variation_1\u0004Tip: Enter your city or zip code in the \"where\" box to show results in your area.":[null,"Tip: Enter your city or zip code in the \"where\" box to show results in your area."],"mobile_home_query_caption\u0004Job title, keywords, or company":[null,""],"near {0}":[null,""],"new count\u0004{0} new":[null,""],"new count / location separator\u0004in {0}":[null,""],"notice_message_for_empty_q_and_l\u0004Enter a job title or location to start a search":[null,""],"pill_filters\u0004All jobs":[null,""],"pill_filters\u0004Date Posted":[null,""],"pill_filters\u0004Last 14 days":[null,""],"pill_filters\u0004Last 24 hours":[null,""],"pill_filters\u0004Last 3 days":[null,""],"pill_filters\u0004Last 7 days":[null,""],"radius-slider-title\u0004Distance":[null,""],"recent search\u0004There is no recent search":[null,""],"recent search\u0004There is no recent search history.":[null,""],"recent search item\u0004go":[null,""],"recent search item\u0004{0} - {1}":[null,""],"recent_search_aria_label\u0004hide":[null,""],"recent_search_aria_label\u0004please tap the bottom of this page for back to search result.":[null,""],"recent_search_aria_label\u0004show":[null,""],"recent_search_ssr_label\u0004edit searches":[null,""],"recent_search_ssr_label\u0004finish":[null,""],"recent_searches_heading\u0004Search history / Saved Searches":[null,""],"rich search\u0004add filters":[null,""],"rich search\u0004dismiss":[null,""],"search":[null,""],"single search\u0004change query":[null,""],"{0} miles":[null,""],"{0} search suggestion":["{0} search suggestions","",""]};}).bind(this.mosaic.i18nOverrides)();
</script>
<script>window['sendPageLoadEndPing'] = function(pageId, tk, st) {var validPageIds = ['viewjob', 'serp']; if (!!Image && validPageIds.indexOf(pageId) > -1 && !!tk && !!st) {var href = '/rpc/pageLoadEnd?pageId=' + pageId + '&tk=' + tk + '&st=' + st + '&__=' + Math.random(); var img = new Image(); img.src = href;}}; window['sendPageLoadEndPing']("serp", "1g17eob8vghqc800", "1650591542559");</script><div class="mosaic-zone" id="mosaic-zone-serpBottomBody"><div id="mosaic-provider-signinprompt" class="mosaic mosaic-provider-signinprompt mosaic-rst"></div><div id="mosaic-provider-dislike-feedback" class="mosaic mosaic-provider-dislike-feedback"><div class="animatedToast i-unmask"><div class=""></div></div></div></div><script type="text/javascript">
try {
window.mosaic.onMosaicApiReady(function() {
var zoneId = 'serpBottomBody';
var providers = window.mosaic.zonedProviders[zoneId];
if (providers) {
providers.filter(function(p) { return window.mosaic.lazyFns[p]; }).forEach(function(p) {
return window.mosaic.api.loadProvider(p);
});
}
});
} catch (e) {};
</script></body>
</html>
👉 Now let's use beautiful soup
to pull out only the information we need.
BeautifulSoup
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
On repl.it, install the package: search for beautifulsoup4 and install it.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
soup.title
# <title>The Dormouse's story</title>
soup.title.name
# u'title'
soup.title.string
# u'The Dormouse's story'
soup.title.parent.name
# u'head'
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']
# u'title'
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
# <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
# 2.3 Extracting Indeed Pages
The page numbers live inside <div class="pagination">...
Let's parse that.
🔸 py
import requests
from bs4 import BeautifulSoup
indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")
indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser')
# print the whole document
# print(indeed_soup)
# the <title>
# print(indeed_soup.title)
# find the pagination div
pagination = indeed_soup.find("div", {"class": "pagination"})
# find the anchor tags
pages = pagination.find_all('a')
# find the <span class="pn">number</span> elements
spans = []
for page in pages:
    spans.append(page.find("span"))
print("---------span/class:pn---------")
print(spans)
print("---------last item in the list---------")
# -1: the first item counting from the end
print(spans[-1])
print("---------everything except the last item---------")
# view the list without its last item
# spans[0:5] would give the 5 items at indexes 0..4
print(spans[:-1])
๐น console
---------span/class:pn---------
[<span class="pn">2</span>, <span class="pn">3</span>, <span class="pn">4</span>, <span class="pn">5</span>, <span class="pn"><span class="np"><svg fill="none" height="24" width="24"><path d="M10 6L8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6-6-6z" fill="#2D2D2D"></path></svg></span></span>]
---------last item in the list---------
<span class="pn"><span class="np"><svg fill="none" height="24" width="24"><path d="M10 6L8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6-6-6z" fill="#2D2D2D"></path></svg></span></span>
---------everything except the last item---------
[<span class="pn">2</span>, <span class="pn">3</span>, <span class="pn">4</span>, <span class="pn">5</span>]
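As a side note, the negative index and the slice used above are plain Python list features — a standalone sketch:
🔸 py
spans = ["2", "3", "4", "5", "next"]
print(spans[-1])   # 'next' — the first item counting from the end
print(spans[:-1])  # ['2', '3', '4', '5'] — everything except the last item
print(spans[0:5])  # the 5 items at indexes 0..4 — here, the whole list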
# 2.4 Extracting Indeed Pages part Two
pages.append(link.find("span").string)
is the same as pages.append(link.string).
beautiful soup reads the string of link > span through link.string when the tag has a single child.
🔸 py
import requests
from bs4 import BeautifulSoup
indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")
indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser')
# ํ์ด์ง div ์ฐพ๊ธฐ
pagination = indeed_soup.find("div", {"class": "pagination"})
# a link ์ฐพ๊ธฐ
links = pagination.find_all('a')
# find the <span class="pn">number</span> elements
pages = []
for link in links[:-1]:
    # same as pages.append(link.find("span").string)
    # cast to int
    pages.append(int(link.string))
# the last page number in the html
max_page = pages[-1]
print(max_page)
🔹 console
5
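To see why link.string descends into the lone <span>, here is a minimal standalone sketch using an inline snippet instead of the Indeed page:
🔸 py
from bs4 import BeautifulSoup
html = '<a href="#"><span class="pn">2</span></a>'
link = BeautifulSoup(html, 'html.parser').find("a")
# a tag with exactly one child delegates .string to that child,
# so both lines below print the same thing
print(link.find("span").string)  # 2
print(link.string)               # 2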
# 2.5 Requesting Each Page
👉 Let's request every page.
I split the code into separate files and organized it into functions.
🔸 main.py
from indeed import extract_indeed_pages, extract_indeed_jobs
# first look up all the page numbers,
last_indeed_page = extract_indeed_pages()
# then fire a request per page — status 200 means success
extract_indeed_jobs(last_indeed_page)
🔸 indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# compute each page's start index and fire the request
# range(N): yields the N numbers 0..N-1
def extract_indeed_jobs(last_page):
    for page in range(last_page):
        result = requests.get(f"{URL}&start={page*LIMIT}")
        print(result.status_code)
🔹 console
200
200
200
200
200
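The &start offset is simply page * LIMIT. A standalone sketch of the URLs the loop above ends up requesting (no network needed):
🔸 py
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# range(5) yields 0..4, so the offsets are 0, 50, 100, 150, 200
for page in range(5):
    print(f"{URL}&start={page*LIMIT}")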
# 2.6 Extracting Titles
The site markup has changed a bit, so after watching the lecture I adapted it my own way to fetch the titles.
From here on the code may differ somewhat from the lecture's,,
🔸 py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# compute each page's start index and fire the request
def extract_indeed_jobs(last_page):
    jobs = []
    # the for loop is commented out to test with just the first page
    #for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, 'html.parser')
    results = soup.find_all("div", {"class": "job_seen_beacon"})
    for result in results:
        title = result.find("td", {"class": "resultContent"}).find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
        print(title) # title
    return jobs
🔹 console
Python/Django Developer
Software Development (All Levels)
Python - Machine Learning SME
Coding teacher for teaching Scratch & Python
Python Developer
Python Developer
Fraud Modeler - Credit Cards / Banking
Logistics Analyst
Online Python Teacher
Python Engineer
Python developer
Python Developer
Analytic Methodologist
Data Scientist with Python
Remote Python Developer
Remote Python Developer
Python Developer Ex Google
Python Developer
3D Solutions Analyst (REMOTE)
Entry Level Software Engineer
Backend Engineer (Python)
Entry Level Python Developer
Data Analyst
Lead Python Developer
Python Developer
Python Developer
Python developer
Data Scientist
AWS Python Developer
Data Analyst: Technical Business Intelligence
USSTRATCOM - Analytic Methodologist
Informatica for Google BigQuery
MACHINE LEARNING DATA SCIENTIST - PYTHON AND R
Python Developer
Database Developer
Basketball Data Scientist (remote opportunity)
Python Developer
Python Developer with AWs
Python Developer
Junior Trader (Remote)
Python Developer
Jr. Software Engineer
Python Developer
Backend software engineer (python)
Python Developer
Software Developer - Entry Level
Python Engineer
Data Scientist
Python Developer
Python Developer
# 2.7 Extracting Companies
🔸 indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# compute each page's start index and fire the request
def extract_indeed_jobs(last_page):
    jobs = []
    # the for loop is commented out to test with just the first page
    #for page in range(last_page):
    result = requests.get(f"{URL}&start={0*LIMIT}")
    soup = BeautifulSoup(result.text, 'html.parser')
    results = soup.find_all("div", {"class": "job_seen_beacon"})
    for result in results:
        resultContent = result.find("td", {"class": "resultContent"})
        # title
        title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
        # company
        company = resultContent.find("div", {"class":"company_location"}).find("span", {"class": "companyName"}).string
        print(f"{title} >>> {company}")
    return jobs
🔹 console
Software Development (All Levels) >>> Insurity
Entry Level Software Engineer >>> Avant
Python/Django Developer >>> Delta
Python - Machine Learning SME >>> Envision
Coding teacher for teaching Scratch & Python >>> YoumeCan Education Center
Python Developer >>> Mechlance INC
Python Developer >>> Mobile Mechanic
Fraud Modeler - Credit Cards / Banking >>> Acunor
Logistics Analyst >>> Lululemon
Software Engineer (Early Career) >>> Apple
Python Engineer >>> Highbrow-Tech
Online Python Teacher >>> YC Solutions Pvt. Ltd.
Python Developer >>> Integration Developer Network LLC
Python/Odoo ERP Developer >>> Novobi, LLC
Python developer >>> Vyze, Inc.
Remote Python Developer >>> Piper Companies
Software Engineer I ( 100% Remote) >>> Windstream Communications
Data Scientist with Python >>> Techladder
Analytic Methodologist >>> Constellation West
Remote Python Developer >>> CTI Consulting
Python Developer Ex Google >>> Laiba Technologies
3D Solutions Analyst (REMOTE) >>> Under Armour
Entry Level Python Developer >>> Marlabs
Backend Engineer (Python) >>> Metaverse HQ
Python Developer >>> WorkCog inc
Data Analyst >>> Young Life
MACHINE LEARNING DATA SCIENTIST - PYTHON AND R >>> InspiHER Tech
Lead Python Developer >>> Interaslabs.llc
Python Developer >>> Yaddala Consulting
Data Analyst: Technical Business Intelligence >>> Barstool Sports
USSTRATCOM - Analytic Methodologist >>> Apogee Engineering, LLC
AWS Python Developer >>> Inscope Global Solutions
Data Scientist >>> FIIDUS
Informatica for Google BigQuery >>> Kaygen
Python Developer >>> Business Intelli Solutions
Python Developer >>> AGM Tech Solutions, INC.
Database Developer >>> Integration Developer Network LLC
Python developer >>> Stefanini Group
Python Developer >>> Aquatic Capital Management
Python Developer >>> Santex Group
Python Developer with AWs >>> Innovative BI Solutions Inc
Basketball Data Scientist (remote opportunity) >>> Madison Square Garden Entertainment
Jr. Software Engineer >>> NBCUniversal
Backend software engineer (python) >>> Benchmark IT Solutions
Python Developer >>> Swan Software Solutions
Python Developer >>> Benedsoft
Software Developer - Entry Level >>> Grant Street Group
Python Developer >>> Infinizi IT Solutions Pvt. Ltd.
Python Engineer >>> Techno corporation inc
Python Developer >>> Morgan Stanley
# 2.8 Extracting Locations and Finishing up
Let's also parse the id
, which lets us build a link of the form
https://www.indeed.com/viewjob?jk={id}
!
🔸 main.py
from indeed import extract_indeed_pages, extract_indeed_jobs
last_indeed_page = extract_indeed_pages()
indeed_jobs = extract_indeed_jobs(last_indeed_page)
print(indeed_jobs)
🔸 indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def extract_indeed_pages():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# parse one job posting
def extract_job(html):
    resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
    # id
    id = html["data-jk"]
    # title
    title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
    # company
    company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
    # location
    location = resultContent.find("div", {"class": "companyLocation"}).string
    return {'id': id, 'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}
# compute each page's start index and fire the request
def extract_indeed_jobs(last_page):
    jobs = []
    # visit every page this time
    for page in range(last_page):
        print(f"Scrapping page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, 'html.parser')
        results = soup.find_all("a", {"class": "tapItem"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs
🔹 console
Scrapping page 0
Scrapping page 1
Scrapping page 2
Scrapping page 3
Scrapping page 4
[{'id': 'b6d26975703d41c2', 'title': 'Python - Machine Learning SME', 'company': 'Envision', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=b6d26975703d41c2'}, {'id': '5a91a49780ab17df', 'title': 'Sr Data Scientist', 'company': 'Zillow', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=5a91a49780ab17df'}, {'id': '2bcd843c58159429', 'title': 'Software Engineer (Early Career)', 'company': 'Apple', 'location': None, 'link': 'https://www.indeed.com/viewjob?jk=2bcd843c58159429'}, {'id': '6d0ab231885eac14', 'title': 'GIS Analyst', 'company': 'Bruce Harris & Associates, Inc', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=6d0ab231885eac14'}, {'id': '788dc9dd8ade27a6', 'title': 'Python/Django Developer', 'company': 'Delta', 'location': None, 'link': 'https://www.indeed.com/viewjob?jk=788dc9dd8ade27a6'}, {'id': 'ebcc2ad72eda66fb', 'title': 'QT Software Engineer (Python and C++)', 'company': 'TriSearch', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=ebcc2ad72eda66fb'}, {'id': '57654f0e7ccfc3b7', 'title': ' ... (snip)
# 2.9 StackOverflow Pages
Let's parse https://stackoverflow.com/jobs/companies?q=python
— it's the url for a "python" search!
First, to keep things separate from stackoverflow, I split the code into indeed.py and so.py, and tidied up the main.py code a bit.
To test so.py, I commented out the existing indeed parsing call, and started by grabbing the a tags inside the pagination with find_all.
🔸 main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
#indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()
🔸 indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# parse one job posting
def extract_job(html):
    resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
    # id
    id = html["data-jk"]
    # title
    title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
    # company
    company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
    # location
    location = resultContent.find("div", {"class": "companyLocation"}).string
    return {'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}
# compute each page's start index and fire the request
def extract_jobs(last_page):
    jobs = []
    # visit every page
    for page in range(last_page):
        print(f"Scrapping page {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, 'html.parser')
        results = soup.find_all("a", {"class": "tapItem"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs
def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
🔸 so.py
import requests
from bs4 import BeautifulSoup
URL = f"https://stackoverflow.com/jobs/companies?q=python"
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    print(pages)
def get_jobs():
    last_page = get_last_page()
    return []
🔹 console
[<a class="s-pagination--item is-selected" href="/jobs/companies?q=python" title="page 1 of 21">
<span>1</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&pg=2" title="page 2 of 21">
<span>2</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&pg=3" title="page 3 of 21">
<span>3</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&pg=4" title="page 4 of 21">
<span>4</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&pg=21" title="page 21 of 21">
<span>21</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&pg=2" title="page 2 of 21">
<span>next</span><i class="material-icons">chevron_right</i>
</a>]
# 2.10 StackOverflow extract jobs
strip
get_text(strip=True)
fetches the text and trims the surrounding whitespace at the same time.
Reference:
markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
soup = BeautifulSoup(markup, 'html.parser')
soup.get_text()                 # '\nI linked to example.com\n'
soup.i.get_text()               # 'example.com'
soup.get_text("|", strip=True)  # 'I linked to|example.com'
🔸 so.py
import requests
from bs4 import BeautifulSoup
URL = f"https://stackoverflow.com/jobs/companies?q=python"
# fetch the last page number
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    # the last link (-1) is the next button, so the second-to-last (-2) is the last page
    # strip=True trims the surrounding whitespace
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)
# fetch the companies
def extract_companies(last_page):
    companies = []
    # loop once per page up to the last page
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-company"})
        for result in results:
            print(result.find("div", {"class": "dismiss-trigger"})["data-id"])
def get_jobs():
    last_page = get_last_page()
    companies = extract_companies(last_page)
    return companies
🔹 console
31152
17914
26760
32154
3060
...
(snip)
...
32169
4603
32176
23691
20917
# 2.11~12 StackOverflow extract job
I fetched the company info and parsed it.
recursive
find_all("title", recursive=False)
searches only the first level, without descending into deeply nested tags.
Reference:
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]
soup.html.find_all("title", recursive=False)
# []
🔸 so.py
import requests
from bs4 import BeautifulSoup
URL = f"https://stackoverflow.com/jobs/companies?q=python"
# fetch the last page number
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    # the last link (-1) is the next button, so the second-to-last (-2) is the last page
    # strip=True trims the surrounding whitespace
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)
def extract_company(html):
    content = html.find("div", {"class": "flex--item fl1 text mb0"})
    # company
    company = content.find("h2").find("a", {"class": "s-link"}).string
    location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
    # location
    location = location.get_text(strip=True)
    # industry
    industry = industry.get_text(strip=True)
    print(location, industry)
    return {"company": company, "location": location, "industry": industry}
# fetch the companies
def extract_companies(last_page):
    companies = []
    # loop once per page up to the last page
    for page in range(last_page):
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-company"})
        for result in results:
            company = extract_company(result)
            companies.append(company)
    return companies
def get_jobs():
    last_page = get_last_page()
    companies = extract_companies(last_page)
    return companies
🔹 console
Edinburgh; Beirut; Bozeman Cloud Computing, Education Technology, SaaS
Dublin 1 Agile Software Development, Cloud-Based Solutions, Computer Software
München Computer Vision, Image Guided Surgery, Medical Imaging
United States Cybersecurity, Healthcare
Elkridge; Linthicum Heights; Vienna Computer Software
...
(snip)
...
No office location Retail, Technical Services, Web Technology
Fulton Business to Business, Security Software
No office location Bioinformatics, Computer Software, Digital Health
No office location Agile Software Development, Software Development / Engineering, Technology Staffing
Berlin Agile Software Development, Automotive
# 2.13 StackOverflow Finish
Let's combine what we parsed from indeed
and stackoverflow
.
🔸 main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()
jobs = so_jobs + indeed_jobs
To check that each scrape succeeds, print from the for loops as below.
🔸 indeed.py
.
.
.
# compute each page's start index and fire the request
def extract_jobs(last_page):
    jobs = []
    # visit every page
    for page in range(last_page):
        print(f"Scrapping ID: Page: {page}")
.
.
.
🔸 so.py
.
.
.
# fetch the companies
def extract_companies(last_page):
    companies = []
    # loop once per page up to the last page
    for page in range(last_page):
        print(f"Scrapping SO: Page: {page}")
.
.
.
🔹 console
Scrapping ID: Page: 0
Scrapping ID: Page: 1
Scrapping ID: Page: 2
Scrapping ID: Page: 3
Scrapping ID: Page: 4
Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
Scrapping SO: Page: 19
Scrapping SO: Page: 20
# 2.14 What is CSV
CSV
: Comma Separated Values
- In VS Code, install the
ExcelViewer
plugin.
- Create a we.csv file.
name, last Name, age, gender
nico, serrano, 12, male
nico, serrano, 12, male
nico, serrano, 12, male
- Open we.csv in VS Code as a preview.
- Try uploading the file to Google Sheets.
As a placeholder, create a save.py
file like the one below.
🔸 main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file
indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()
jobs = so_jobs + indeed_jobs
save_to_file(jobs)
🔸 save.py
import csv
def save_to_file(jobs):
    return
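For reference, the csv module can produce the same we.csv from above — a minimal sketch of what save.py will grow into:
🔸 py
import csv
# writes the same rows as the hand-written we.csv above
with open("we.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["name", "last Name", "age", "gender"])
    for _ in range(3):
        writer.writerow(["nico", "serrano", 12, "male"])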
# 2.15 Saving to CSV
🔸 main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file
indeed_jobs = get_indeed_jobs()
#so_jobs = get_so_jobs()
jobs = indeed_jobs
save_to_file(jobs)
🔸 save.py
import csv
def save_to_file(jobs):
    file = open("jobs.csv", mode="w")
    writer = csv.writer(file)
    # write the header row
    writer.writerow(["title", "company", "location", "link"])
    for job in jobs:
        # job.values() alone is of type dict_values,
        # so cast it to a list
        writer.writerow(list(job.values()))
    return
🔹 jobs.csv
title,company,location,link
Remote Python Developer,CTI Consulting,,https://www.indeed.com/viewjob?jk=75422ff0a5cfbe28
Python Developer,Aquatic Capital Management,"Remote in Chicago, IL",https://www.indeed.com/viewjob?jk=5bdb3c2265c60c4a
Senior Python Developer,Gallup,,https://www.indeed.com/viewjob?jk=b3a32bc1a87689e7
C++/Python Developer,FIIDUS,Remote,https://www.indeed.com/viewjob?jk=3720520a9b3f386f
Informatica for Google
.
.
.
# 2.16 OMG THIS IS AWESOME
After a final cleanup the code looks like the below, and you can confirm that two csv files are created.
python_scrapper_replit
🔸 main.py
from indeed import get_jobs as get_indeed_jobs
from so import get_companies as get_so_companies
from save import save_to_file_jobs, save_to_file_companies
indeed_jobs = get_indeed_jobs()
so_companies = get_so_companies()
save_to_file_jobs(indeed_jobs)
save_to_file_companies(so_companies)
🔸 indeed.py
import requests
from bs4 import BeautifulSoup
LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"
# the last page number in the pagination
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, 'html.parser')
    # find the last page inside the html
    pagination = soup.find("div", {"class": "pagination"})
    links = pagination.find_all('a')
    pages = []
    for link in links[:-1]:
        pages.append(int(link.string))
    max_page = pages[-1]
    return max_page
# parse one job posting
def extract_job(html):
    resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
    # id
    id = html["data-jk"]
    # title
    title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
    # company
    company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
    # location
    location = resultContent.find("div", {"class": "companyLocation"}).string
    return {'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}
# compute each page's start index and fire the request
def extract_jobs(last_page):
    jobs = []
    # visit every page
    for page in range(last_page):
        print(f"Scrapping ID: Page: {page}")
        result = requests.get(f"{URL}&start={page*LIMIT}")
        soup = BeautifulSoup(result.text, 'html.parser')
        results = soup.find_all("a", {"class": "tapItem"})
        for result in results:
            job = extract_job(result)
            jobs.append(job)
    return jobs
def get_jobs():
    last_page = get_last_page()
    jobs = extract_jobs(last_page)
    return jobs
🔸 so.py
import requests
from bs4 import BeautifulSoup
URL = f"https://stackoverflow.com/jobs/companies?q=python"
# fetch the last page number
def get_last_page():
    result = requests.get(URL)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    # the last link (-1) is the next button, so the second-to-last (-2) is the last page
    # strip=True trims the surrounding whitespace
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)
def extract_company(html):
    content = html.find("div", {"class": "flex--item fl1 text mb0"})
    # company
    company = content.find("h2").find("a", {"class": "s-link"}).string
    location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
    # location
    location = location.get_text(strip=True)
    # industry
    industry = industry.get_text(strip=True)
    # link
    link = content.find("h2").find("a", {"class": "s-link"})['href']
    return {"company": company, "location": location, "industry": industry, "apply_link": f"https://stackoverflow.com{link}"}
# fetch the companies
def extract_companies(last_page):
    companies = []
    # loop once per page up to the last page
    for page in range(last_page):
        print(f"Scrapping SO: Page: {page}")
        result = requests.get(f"{URL}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-company"})
        for result in results:
            company = extract_company(result)
            companies.append(company)
    return companies
def get_companies():
    last_page = get_last_page()
    companies = extract_companies(last_page)
    return companies
🔸 save.py
import csv
def save_to_file_jobs(jobs):
    file = open("jobs.csv", mode="w")
    writer = csv.writer(file)
    # write the header row
    writer.writerow(["title", "company", "location", "link"])
    for job in jobs:
        # job.values() alone is of type dict_values,
        # so cast it to a list
        writer.writerow(list(job.values()))
    return
def save_to_file_companies(companies):
    file = open("companies.csv", mode="w")
    writer = csv.writer(file)
    # write the header row
    writer.writerow(["company", "location", "industry", "apply_link"])
    for company in companies:
        # company.values() alone is of type dict_values,
        # so cast it to a list
        writer.writerow(list(company.values()))
    return
🔹 console
Scrapping ID: Page: 0
Scrapping ID: Page: 1
Scrapping ID: Page: 2
Scrapping ID: Page: 3
Scrapping ID: Page: 4
Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
Scrapping SO: Page: 19
Scrapping SO: Page: 20
🔹 jobs.csv
title,company,location,link
"Security Engineer- AWS, Python",The Getch,Remote,https://www.indeed.com/viewjob?jk=5fba88b67d1b72dc
Python Developer,Paktolus,Remote,https://www.indeed.com/viewjob?jk=df1cce3cc988f374
Python Developer,Simplified IT Solutions,Remote,https://www.indeed.com/viewjob?jk=7e3a3e84485bb544
Python Developer,EMR CPR LLC,"Austin, TX",https://www.indeed.com/viewjob?jk=52298adb7d458010
Senior Python AWS Developer,DataAxxis,,https://www.indeed.com/viewjob?jk=cd01de5c4adc7ac4
Associate Solutions Architect - Early Career 2022 (US),"Amazon Web Services, Inc.",,https://www.indeed.com/viewjob?jk=3905541e1957ec4a
Python Developer,Oremda Infotech Inc.,Remote,https://www.indeed.com/viewjob?jk=2c04530f755a2932
Senior Software Engineer,University of Nebraska Medical Center,,https://www.indeed.com/viewjob?jk=067a8cf9dccbdc70
... (snip)
🔹 companies.csv
company,location,industry,apply_link
Administrate,Edinburgh; Beirut; Bozeman,"Cloud Computing, Education Technology, SaaS",https://stackoverflow.com/jobs/companies/administrate?c=MYHq0mvrMlWD3iKY&q=python
"Arista Networks, Inc",Dublin 1,"Agile Software Development, Cloud-Based Solutions, Computer Software",https://stackoverflow.com/jobs/companies/www-arista-com?
... (snip)
# ⚡ 3. GET READY FOR DJANGO
# 3.0 Django is AWESOME
An introduction to Django!
# 3.1 *args **kwargs
A Python function can take an unlimited number of positional arguments (*args), but to pass key=value arguments you need **kwargs
, which is short for keyword arguments
.
🔸 py
def plus(a, b, *args, **kwargs):
    print(args)
    print(kwargs) # key=value pairs arrive here as keyword arguments
    return a + b
plus(1, 2, 3, 4, 5, 1, 2, 3, 4, 3, 4, 5, hello=True, bye=True)
🔹 console
(3, 4, 5, 1, 2, 3, 4, 3, 4, 5)
{'hello': True, 'bye': True}
An unlimited-argument calculator looks like this.
🔸 py
def plus(*args):
    result = 0
    for number in args:
        result += number
    print(result)
plus(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
🔹 console
55
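The same calculator in one line, using the built-in sum over the packed arguments:
🔸 py
def plus(*args):
    # args is a tuple, and sum adds up its elements
    return sum(args)
print(plus(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))  # 55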
# 3.2 Intro to Object Oriented Programming
🔸 py
# a blueprint
class Car():
    wheels = 4
    doors = 4
    windows = 4
    seats = 4
porche = Car() # instance
porche.color = "Red"
print(porche.windows, porche.color)
ferrari = Car()
ferrari.color = "Yellow"
print(ferrari.windows, ferrari.color)
mini = Car()
mini.color = "White"
🔹 console
4 Red
4 Yellow
# 3.3 Methods part One
method
A function inside a class is called a method.
Declared globally it's a function / declared inside a class it's a method.
Every Python method requires at least one argument.
A method's first argument is the thing calling it — itself, the instance.
🔸 py
# a blueprint
class Car():
    wheels = 4
    doors = 4
    windows = 4
    seats = 4
    # method (inside a class it's a method / outside it's a function)
    def start(self):
        print(self.doors)
        print(self.color)
        print("I started")
porche = Car()
porche.color = "RED"
porche.start()
🔹 console
4
RED
I started
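Since the first argument is the instance itself, calling the method through the class makes the hidden self explicit — a self-contained sketch (not from the lecture):
🔸 py
class Car():
    def start(self):
        print("I started", self)
porche = Car()
porche.start()     # Python passes porche as self automatically
Car.start(porche)  # the explicit equivalent of the line above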
# 3.4 Methods part Two
🔸 py
# a blueprint
class Car():
    # it's better to set the attributes in __init__
    def __init__(self, **kwargs):
        # print(kwargs)
        self.wheels = 4
        self.doors = 4
        self.windows = 4
        self.seats = 4
        # if the key is missing, the second argument is used as the default
        self.color = kwargs.get("color", "black")
        self.price = kwargs.get("price", "$20")
    # method override
    def __str__(self):
        return f"Car with {self.wheels} wheels"
# dir lists everything inside the class
# print(dir(Car))
porche = Car(color="green", price="$40")
# printing porche calls the built-in __str__ method
print(porche)
print(porche.color, porche.price)
mini = Car()
print(mini.color, mini.price)
🔹 console
Car with 4 wheels
green $40
black $20
# 3.5 Extending Classes
🔸 py
# a blueprint
class Car():
    # it's better to set the attributes in __init__
    def __init__(self, **kwargs):
        # print(kwargs)
        self.wheels = 4
        self.doors = 4
        self.windows = 4
        self.seats = 4
        # if the key is missing, the second argument is used as the default
        self.color = kwargs.get("color", "black")
        self.price = kwargs.get("price", "$20")
    # method override
    def __str__(self):
        return f"Car with {self.wheels} wheels"
# extends the Car class
class Convertible(Car):
    # extend the parent __init__
    def __init__(self, **kwargs):
        super().__init__(**kwargs) # call the parent class
        self.time = kwargs.get("time", 10)
    # add a method
    def take_off(self):
        return "taking off"
    # override
    def __str__(self):
        return f"Car with no roof"
porche = Convertible(color="green", price="$40")
mini = Car()
print(porche)
print(porche.color)
print(porche.take_off())
🔹 console
Car with no roof
green
taking off
# ⚡ 4. 2020 BONUS CLASS
# 4.0 Welcome to 2020 Update
Flask
: a micro-framework that lets you build websites with Python.
We'll use Flask
to put the scrapper on a web server!
# 4.1 Introduction to Flask
In the replit
packages tab, search for Flask
and install it.
🔸 py
from flask import Flask
app = Flask("SuperScrapper")
app.run(host="0.0.0.0")
🔹 web
Re-run whenever the code changes.
🔸 py
from flask import Flask
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return "Hello! Welcome to mi casa!"
@app.route("/contact")
def potato(): # the function name doesn't have to match the route
    return "Contact me!"
app.run(host="0.0.0.0")
🔹 web
# 4.2 Dynamic URLs and Templates
🔸 py
from flask import Flask
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return "Hello! Welcome to mi casa!"
# a URL placeholder must be accepted as a parameter, otherwise an error occurs:
# TypeError: potato() got an unexpected keyword argument 'username'
@app.route("/<username>")
def potato(username):
    return f"Hello your name is {username}"
app.run(host="0.0.0.0")
🔹 web
Create a templates
folder, then add an html
file to it.
🔸 potato.html
<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Job Search</h1>
    <form action="/report" method="get">
      <input name="word" placeholder='Search for a job' required />
      <button>Search</button>
    </form>
  </body>
</html>
🔸 py
from flask import Flask, render_template
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return render_template("potato.html")
app.run(host="0.0.0.0")
🔹 web
Note that the html file is found and served without giving a path — flask
tracks it down all by itself. Neat..
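By default Flask looks for a templates/ directory next to the app; the location can also be set explicitly when constructing the app (a sketch, not from the lecture):
🔸 py
from flask import Flask
# template_folder defaults to "templates"; written out here for clarity
app = Flask("SuperScrapper", template_folder="templates")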
# 4.3 Forms and Query Arguments
Create one more template, report.html
, and have the search button pass the query
along as below.
The received value is handed to report.html
through render_template for rendering
.
🔸 report.html
<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>You are looking for {{searchingBy}}</h3>
    <h4>{{color}}</h4>
  </body>
</html>
🔸 main.py
from flask import Flask, render_template, request
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word') # read the query string
    return render_template("report.html", searchingBy=word, color="RED") # pass values to the template
app.run(host="0.0.0.0")
🔹 web
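A quick way to check the route without a browser is Flask's built-in test client — a sketch to run before the blocking app.run call, with the app defined above:
🔸 py
# simulate GET /report?word=python; request.args.get('word') returns "python"
with app.test_client() as client:
    response = client.get("/report?word=python")
    print(response.status_code)  # 200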
# 4.4 Scrapper Integration
I changed it so an incoming search word is lowercased.
I also made it redirect
when the /report
url is opened without a search word.
🔸 py
from flask import Flask, render_template, request, redirect
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html", searchingBy=word, color="RED")
app.run(host="0.0.0.0")
Now copy the scrapper we made earlier into the website repl (bring so.py over and paste it into a scrapper.py file).
The website scrapper repl doesn't have requests or beautifulsoup, so install those packages.
Let get_companies take a word, build the url inside that function, and pass it down to get_last_page; adjust extract_companies so it takes the url as well.
🔸 scrapper.py
import requests
from bs4 import BeautifulSoup
# fetch the last page number
def get_last_page(url):
    result = requests.get(url)
    soup = BeautifulSoup(result.text, "html.parser")
    pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
    # the last link (-1) is the next button, so the second-to-last (-2) is the last page
    # strip=True trims the surrounding whitespace
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)
def extract_company(html):
    content = html.find("div", {"class": "flex--item fl1 text mb0"})
    # company
    company = content.find("h2").find("a", {"class": "s-link"}).string
    location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
    # location
    location = location.get_text(strip=True)
    # industry
    industry = industry.get_text(strip=True)
    # link
    link = content.find("h2").find("a", {"class": "s-link"})['href']
    return {"company": company, "location": location, "industry": industry, "apply_link": f"https://stackoverflow.com{link}"}
# fetch the companies
def extract_companies(last_page, url):
    companies = []
    # loop once per page up to the last page
    for page in range(last_page):
        print(f"Scrapping SO: Page: {page}")
        result = requests.get(f"{url}&pg={page + 1}")
        soup = BeautifulSoup(result.text, "html.parser")
        results = soup.find_all("div", {"class": "-company"})
        for result in results:
            company = extract_company(result)
            companies.append(company)
    return companies
def get_companies(word):
    url = f"https://stackoverflow.com/jobs/companies?q={word}"
    last_page = get_last_page(url)
    companies = extract_companies(last_page, url)
    return companies
🔸 main.py
from flask import Flask, render_template, request, redirect
from scrapper import get_companies
app = Flask("SuperScrapper")
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
        comps = get_companies(word)
        print(comps)
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html", searchingBy=word, color="RED")
app.run(host="0.0.0.0")
Now when you search, the results scraped for that word are printed.
🔹 console
Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
[{'company': 'Branding Brand', 'location': 'Pittsburgh', 'industry': 'eCommerce, Headless Technology, Mobile Development', 'apply_link': 'https://stackoverflow.com/jobs/companies/branding-brand?c=1Cp4WgLdIRYzS4I8&q=react'}, {'company': 'WBS Gruppe', 'location': 'Berlin', 'industry': 'Education, eLearning, Online-Coaching', 'apply_link': 'https://stackoverflow.com/jobs/companies/wbs-gruppe?c=L5ITLrEQ7vTvO9BC&q=react'}, {'company': 'Amaris.AI', 'location': 'Singapore', 'industry': 'Artificial Intelligence, Consulting, Cybersecurity', 'apply_link': 'https://stackoverflow.com/jobs/companies/amarisai__suspended?c=NnKFnOhHU7mvFjJC&q=react'}, {'company': 'Pragmateam', 'location': 'Sydney; Porto Alegre; Gold Coast', 'industry': 'Product Development, Software Development / Engineering', 'apply_link': 'https://stackoverflow.com/jobs/companies/pragmateam?c=KEeyy1hjI6DTPRE4&q=react'}, {'company': 'Night Market', 'location': 'New York; Los Angeles; Toronto', 'industry': 'Advertising Technology, Data & Analytics, Media', 'apply_link': 'https://stackoverflow.com/jobs/companies/night-market?c=ODry4n2QaXBorfwY&q=react'}, {'company': 'EVS llc', 'location': 'Westminster; Del Mar', 'industry': 'Inventory Management Software, Supply Chain Management Software, Warehouse Management Software (WMS)', 'apply_link': 'https://stackoverflow.com/jobs/companies/evs-llc?c=OA83Of1Loao4UtDq&q=react'}, {'company': 'Hubble Pte Ltd', 'location': 'Singapore', 'industry': '3D Models, Construction, Information Technology', 'apply_link': 'https://stackoverflow.com/jobs/companies/hubble-pte-ltd?c=OEw1V06PTVhZKYcE&q=react'}, {'company': 'Paradox Cat GmbH', 'location': 'Ingolstadt; München', 'industry': 'Automotive, Computer Graphics, Project Management', 'apply_link': 'https://stackoverflow.com/jobs/companies/paradox-cat-ltd?c=HVGSQCvPD9snKwlG&q=react'}, {'company': 'AMBOSS ', 'location': 'Köln; Berlin; New York', 'industry': 'Education Technology, Healthcare, Medical', 'apply_link': 'https:/
... (snip)
# 4.5 Faster Scrapper
Make a fake db and store the search results in it.
If a word has been searched before, serve it from the fake db; otherwise scrape fresh and put the result into the fake db.
🔸 main.py
from flask import Flask, render_template, request, redirect
from scrapper import get_companies
app = Flask("SuperScrapper")
# fake db
db = {}
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
        # check whether it's already in the db
        fromDb = db.get(word)
        if fromDb:
            comps = fromDb
        else:
            comps = get_companies(word)
            db[word] = comps
        print(comps)
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html",
        searchingBy=word,
        resultsNumber=len(comps)
    )
app.run(host="0.0.0.0")
🔸 report.html
<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
  </body>
</html>
🔹 web
# 4.6 Rendering Jobs!
To write Python inside a flask html
template, do it as shown below.
I used CSS grid to lay the fetched companies out in a table-like format.
🔸 main.py
from flask import Flask, render_template, request, redirect
from scrapper import get_companies
app = Flask("SuperScrapper")
# fake db
db = {}
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
        # check whether it's already in the db
        existingComps = db.get(word)
        if existingComps:
            comps = existingComps
        else:
            comps = get_companies(word)
            db[word] = comps
        print(comps)
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html",
        searchingBy=word,
        resultsNumber=len(comps),
        comps=comps
    )
app.run(host="0.0.0.0")
🔸 report.html
<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
    <style>
      section {
        display: grid;
        gap: 20px;
        grid-template-columns: repeat(4, 1fr);
      }
    </style>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
    <section>
      <h4>company</h4>
      <h4>location</h4>
      <h4>industry</h4>
      <h4>apply_link</h4>
      {% for comp in comps %}
      <span>{{comp.company}}</span>
      <span>{{comp.location}}</span>
      <span>{{comp.industry}}</span>
      <a href="{{comp.apply_link}}">Apply</a>
      {% endfor %}
    </section>
  </body>
</html>
🔹 web
# 4.7 Export Route
Let's add a button and export to csv
.
🔸 main.py
from flask import Flask, render_template, request, redirect
from scrapper import get_companies
app = Flask("SuperScrapper")
# fake db
db = {}
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
        # check whether it's already in the db
        existingComps = db.get(word)
        if existingComps:
            comps = existingComps
        else:
            comps = get_companies(word)
            db[word] = comps
        print(comps)
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html",
        searchingBy=word,
        resultsNumber=len(comps),
        comps=comps
    )
@app.route("/export")
def export():
    # use try/except (any exception jumps to except)
    try:
        word = request.args.get('word') # no search word is an error
        if not word:
            raise Exception()
        word = word.lower()
        comps = db.get(word)
        if not comps: # not in the db is an error too
            raise Exception()
        return f"Generate CSV for {word}"
    except:
        return redirect('/')
app.run(host="0.0.0.0")
🔸 report.html
<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
    <style>
      section {
        display: grid;
        gap: 20px;
        grid-template-columns: repeat(4, 1fr);
      }
    </style>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
    <a href="/export?word={{searchingBy}}">Export to CSV</a>
    <section>
      <h4>company</h4>
      <h4>location</h4>
      <h4>industry</h4>
      <h4>apply_link</h4>
      {% for comp in comps %}
      <span>{{comp.company}}</span>
      <span>{{comp.location}}</span>
      <span>{{comp.industry}}</span>
      <a href="{{comp.apply_link}}">Apply</a>
      {% endfor %}
    </section>
  </body>
</html>
🔹 web
# 4.8 File Download
Now bring save.py
over from the earlier scrapper: create a new exporter.py
file and paste it in.
Build the csv
file, then use send_file
to serve it as a download.
🔸 main.py
from flask import Flask, render_template, request, redirect, send_file
from scrapper import get_companies
from exporter import save_to_file_companies
app = Flask("SuperScrapper")
# fake db
db = {}
@app.route("/")
def home():
    return render_template("potato.html")
@app.route("/report")
def report():
    word = request.args.get('word')
    if word: # if word exists, lowercase it
        word = word.lower()
        # check whether it's already in the db
        existingComps = db.get(word)
        if existingComps:
            comps = existingComps
        else:
            comps = get_companies(word)
            db[word] = comps
        print(comps)
    else: # otherwise redirect home
        return redirect("/")
    return render_template("report.html",
        searchingBy=word,
        resultsNumber=len(comps),
        comps=comps
    )
@app.route("/export")
def export():
    # use try/except (any exception jumps to except)
    try:
        word = request.args.get('word') # no search word is an error
        if not word:
            raise Exception()
        word = word.lower()
        comps = db.get(word)
        if not comps: # not in the db is an error too
            raise Exception()
        save_to_file_companies(comps)
        return send_file("companies.csv")
    except:
        return redirect('/')
app.run(host="0.0.0.0")
🔸 exporter.py
import csv
def save_to_file_companies(companies):
    file = open("companies.csv", mode="w")
    writer = csv.writer(file)
    # write the header row
    writer.writerow(["company", "location", "industry", "apply_link"])
    for company in companies:
        # company.values() alone is of type dict_values,
        # so cast it to a list
        writer.writerow(list(company.values()))
    return
After searching, hitting export
downloads the file, as you can confirm.
🔹 csv
# Reference
Building a Web Scraper with Python (the Nomad Coders course)
Python library
python-scrapper-replit
python-super-scrapper-replit