# ๐ŸŽˆ Python์œผ๋กœ ์›น ์Šคํฌ๋ž˜ํผ ๋งŒ๋“ค๊ธฐ (nomad coders/๋…ธ๋งˆ๋“œ์ฝ”๋”)

์‹ฌ์‹ฌํ•ด์„œ ๋“ค์–ด๋ณด๋Š” Python ๊ฐ•์˜ .. ๐Ÿ‘“

2022-04 ๊ฐ•์˜๋ฅผ ๋“ค์—ˆ์„ ๋•Œ, indeed ์‚ฌ์ดํŠธ ๋งˆํฌ์—… ๊ตฌ์กฐ๊ฐ€ ์ข€ ๋‹ฌ๋ผ์กŒ๊ณ , stackoverflow job ์‚ฌ์ดํŠธ๋Š” ์ฐพ์„ ์ˆ˜ ์—†์—ˆ๋‹ค,, ๋Œ€์‹  https://stackoverflow.com/jobs/companies ์‚ฌ์ดํŠธ๋ฅผ ํŒŒ์‹ฑํ•ด๋ณด๋ ค๊ณ  ํ•œ๋‹ค.

๋”ฐ๋ผ์„œ ๊ฐ•์˜์— ์ž‘์„ฑ๋œ ์ฝ”๋“œ์™€ ๋‹ค๋ฅผ ์ˆ˜ ์žˆ์Œ!

์šฐ์„  ๊ฐ•์˜๋ฅผ ๋“ฃ๊ณ , ๋ฐ”๋€ HTML์„ ๋ฐฐ์šด๋Œ€๋กœ ์Šคํฌ๋žฉํ•‘ ํ•˜์˜€๋‹ค. ๊ฐ•์˜ ์†Œ์ œ๋ชฉ ๋ฝ‘์œผ๋ ค๊ณ  ์Šคํฌ๋žฉํ•‘ ํ™œ์šฉํ•˜์—ฌ ์•„๋ž˜์™€ ๊ฐ™์ด ์ œ๋ชฉ์„ ๋ฝ‘์•˜๋‹ค !

beautiful soup ์œ ์šฉํ•œ ๊ฑฐ ๊ฐ™๋‹ค. ์ž˜ ์จ๋ด์•ผ์ง€.

๐Ÿ”ธ py

import requests
from bs4 import BeautifulSoup

result = requests.get("https://nomadcoders.co/python-for-beginners/lobby")
soup = BeautifulSoup(result.text, 'html.parser')
titles = soup.find_all("span", {"class": "px-6 py-4 whitespace-nowrap text-sm leading-5 overflow-hidden font-medium flex items-center text-gray-400"})
for title in titles:
  print(title.text)

๐Ÿ”น console

#0.5 How to Ask for Help (02:00) 
#0.6 Code Python Online (03:08) 
#1.0 Data Types of Python (08:48) 
#1.1 Lists in Python (08:30) 
#1.2 Tuples and Dicts (06:33) 
...
์ค‘๋žต
...
#4.6 Rendering Jobs! (12:24) 
#4.7 Export Route (08:48) 
#4.8 File Download (05:21) 
#4.9 Recap (07:28) 
#4.10 Conclusions (02:56) 

# โšก 1. THEORY

You can test things quickly on repl.it.

# 1.0 Data Types of Python

๐Ÿ”ธ py

# python ๋ณ€์ˆ˜๋ช…์€  snakeCase๋กœ ์จ์ค€๋‹ค.
a_string = 'like this'
a_number = 3
a_float = 3.12
a_boolean = False # in Python, booleans start with a capital letter (True/False).
a_none = None # empty; the value does not exist.

# ์ถœ๋ ฅ
print(type(a_number))
print(type(a_string))
print(type(a_none))

๐Ÿ”น console

<class 'int'>
<class 'str'>
<class 'NoneType'>

# 1.1 Lists in Python

๐Ÿ”ธ py

# mutable sequence (can be modified)
days = ["Mon", "tue", "Wed", "Thur", "Fri"]

print("------list------")
print(days)
print("Mon" in days)
print("Man" in days)
print(days[3])
print(type(days))

print("------append------")
days.append("Sat")
print(days)

print("------reverse------")
days.reverse()
print(days)

๐Ÿ”น console

------list------
['Mon', 'tue', 'Wed', 'Thur', 'Fri']
True
False
Thur
<class 'list'>
------append------
['Mon', 'tue', 'Wed', 'Thur', 'Fri', 'Sat']
------reverse------
['Sat', 'Fri', 'Thur', 'Wed', 'tue', 'Mon']

# 1.2 Tuples and Dicts

๐Ÿ”ธ py

# immutable sequence (cannot be modified)
days = ("Mon", "tue", "Wed", "Thur", "Fri")

print("------tuple------")
print(days)
print(type(days))

print("------dictionary------")
nico = {
  "name": "Nico",
  "age": 29,
  "korean": True,
  "fav_food": ["Kimchi", "Sashimi"]
}

print(type(nico))
print(nico)
nico["handsome"] = True
print(nico)

๐Ÿ”น console

------tuple------
('Mon', 'tue', 'Wed', 'Thur', 'Fri')
<class 'tuple'>
------dictionary------
<class 'dict'>
{'name': 'Nico', 'age': 29, 'korean': True, 'fav_food': ['Kimchi', 'Sashimi']}
{'name': 'Nico', 'age': 29, 'korean': True, 'fav_food': ['Kimchi', 'Sashimi'], 'handsome': True}

# 1.3 Built in Functions

๋‹ค์–‘ํ•œ Functions (opens new window)

๐Ÿ”ธ py

print("------len------")
print(len("fjsiodjfoisjf"))

print("------type------")
age = "18"
print(type(age))
n_age = int(age)
print(type(n_age))

๐Ÿ”น console

------len------
13
------type------
<class 'str'>
<class 'int'>

# 1.4 Creating Your First Python Function

๐Ÿ”ธ py

# ๋“ค์—ฌ์“ฐ๊ธฐ๋ฅผ ํ•ด์ค˜์•ผ define๋จ !
# python์€ {}์œผ๋กœ ๊ตฌ๋ถ„ํ•˜์ง€ ์•Š๋Š”๋‹ค.
def say_hello(): 
  print("hello")
  print("bye")

say_hello()
say_hello()
say_hello()

๐Ÿ”น console

hello
bye
hello
bye
hello
bye

# 1.5 Function Arguments

๐Ÿ”ธ py

def say_hello(who): 
  print("hello", who)

say_hello("Nico")
say_hello(True)
# say_hello() - error!

print("------")

def plus(a, b):
  print(a + b)
def minus(a, b = 0): # default value
  print(a - b)
  
plus(2, 5)
minus(2)
minus(2, 5)

๐Ÿ”น console

hello Nico
hello True
------
7
2
-3

# 1.6 Returns

๐Ÿ”ธ py

def p_plus(a, b):
  print(a + b)
  
def r_plus(a, b):
  return a + b
  print("test") # return ์ดํ›„๋กœ ์‹คํ–‰๋˜์ง€ ์•Š์Œ

p_result = p_plus(2, 3)
r_result = r_plus(2, 3)

print(p_result, r_result)

๐Ÿ”น console

5
None 5

# 1.7 Keyworded Arguments

๐Ÿ”ธ py

# format
def say_hello(name, age):
  return f"Hello {name} you are {age} years old"

 # ์ธ์ž๊ฐ€ ๋ฐ”๋€Œ์–ด๋„ ๊ดœ์ฐฎ์Œ! ์ˆœ์„œ๊ฐ€ ์ƒ๊ด€์—†๋‹ค!
hello = say_hello(age = "12", name = "Nico")
print(hello)

๐Ÿ”น console

Hello Nico you are 12 years old

# 1.8 Code Challenge!

7๊ฐœ ์—ฐ์‚ฐ ๊ณ„์‚ฐ๊ธฐ ๋งŒ๋“ค๊ธฐ

๐Ÿ”ธ py

# 7๊ฐœ ์—ฐ์‚ฐ
def plus(a, b):
  return calc(a, b, '+')

def minus(a, b):
  return calc(a, b, '-')

def times(a, b):
  return calc(a, b, '*')

def division(a, b):
  return calc(a, b, '/')

def nega(a, b):
  return calc(a, b, 'nega')

def remain(a, b):
  return calc(a, b, '%')

def power(a, b):
  return calc(a, b, '**')
  
# validate and calculate
def calc(a, b, func):
  try:
    a = float(a)
    b = float(b)
  except ValueError:
    return 'Please enter a number.'
  if func == '+':
    return a + b
  elif func == '-':
    return a - b
  elif func == '*':
    return a * b
  elif func == '/':
    if b == 0:
      return 'Cannot divide by zero.'
    return a / b
  elif func == 'nega':
    return -a
  elif func == '**':
    return a ** b
  else: # unknown operation name (previously this case silently returned None)
    return 'Please check a method name.'

print("------print(plus(3,'test'))------")
print(plus(3,'test'))
print("------print(minus(3,5))------")
print(minus(3,5))

๐Ÿ”น console

------print(plus(3,'test'))------
Please enter a number.

------print(minus(3,5))------
-2.0

# 1.9 Conditionals Part One

๐Ÿ”ธ py

def plus(a, b):
  if type(b) is int or type(b) is float:
    return a + b
  else:
    return None

print(plus(12, 1.2))
print(plus(12, "test"))

๐Ÿ”น console

13.2
None
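
As an aside (my addition, not from the lecture): isinstance() is the more idiomatic way to write this type check, and it can take a tuple of types.

🔸 py

def plus(a, b):
  # isinstance also covers subclasses and checks several types at once
  if isinstance(b, (int, float)):
    return a + b
  return None

print(plus(12, 1.2))    # 13.2
print(plus(12, "test")) # None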

# 1.10 if else and or

๐Ÿ”ธ py

def age_check(age):
  print(f"------you are {age}------")
  if age < 18:
    print("you cant drink")
  elif age == 18:
    print("you are new to this!")
  elif age > 20 and age < 25:
    print("you are still kind of young")
  else:
    print("enjoy your drink")
  print()

age_check(16)
age_check(18)
age_check(23)
age_check(30)

๐Ÿ”น console

------you are 16------
you cant drink

------you are 18------
you are new to this!

------you are 23------
you are still kind of young

------you are 30------
enjoy your drink

# 1.11 for in

๐Ÿ”ธ py

days = ("Mon", "Tue", "Wed", "Thur", "Fri")

for d in days:
  if d == 'Wed':
    break
  else:
    print(d)

for n in [1, 2, 3, 4, 5]:
  print(n)

for letter in "nicolas":
  print(letter)

๐Ÿ”น console

Mon
Tue
1
2
3
4
5
n
i
c
o
l
a
s

# 1.12 Modules

๐Ÿ”ธ main.py

# ํ•ญ์ƒ ์‚ฌ์šฉํ•  ๊ฒƒ๋งŒ ๊ฐ€์ ธ์˜ค๋„๋ก !
from math import ceil, fsum as sexy_sum
from calculator import plus 

print(ceil(1.2))
print(sexy_sum([1, 2, 3, 4, 5, 6, 7]))
print(plus(2, 3))

๐Ÿ”ธ calcuator.py

def plus(a, b):
  return a + b

๐Ÿ”น console

2
28.0
5
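
For comparison (my aside, not from the lecture): importing the whole module keeps each call explicit, at the cost of typing the module name.

🔸 py

import math

# same results as the from-imports above
print(math.ceil(1.2))                     # 2
print(math.fsum([1, 2, 3, 4, 5, 6, 7]))  # 28.0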

# โšก 2. BUILDING A JOB SCRAPPER

# 2.0~1 What is Web Scrapping~What are We Building

ํŒŒ์ด์ฌ์œผ๋กœ ์‚ฌ์ดํŠธ ์Šคํฌ๋žฉํ•‘ํ•˜๋Š” ๊ธฐ๋Šฅ์„ ๋งŒ๋“ค ์˜ˆ์ • -

indeed ํ•œ๊ตญ ๋ฒ„์ „์ด ์ƒ๊ฒจ์„œ ์ข€ ํ—ท๊ฐˆ๋ฆด ์ˆ˜ ์žˆ์œผ๋‹ˆ https://www.indeed.com/jobs?q=python&limit=50 ๋ฐ”๋กœ ์ด url๋กœ ์Šคํฌ๋žฉํ•‘ ํ•˜๋„๋ก ํ•  ๊ฒƒ !

stackOverflow job์€ ์‚ฌ๋ผ์ง„ ๊ฑฐ ๊ฐ™๋‹ค,, ๋‹ค๋ฅธ ์‚ฌ์ดํŠธ๋กœ ๋Œ€์ฒดํ•˜๋˜์ง€ ํ•ด์„œ ํ…Œ์ŠคํŠธ ํ•ด๋ณด์ž.

# 2.2 Navigating with Python

๐Ÿ“Œ ์šฐ์„  HTML ์ •๋ณด๋ฅผ ๊ฐ€์ ธ์˜ค์ž.

python requests

Fire HTTP requests with the requests library: https://docs.python-requests.org/en/latest/

repl.it์—์„œ ํŒจํ‚ค์ง€ ์„ค์น˜๋ฅผ ํ•ด์ฃผ์ž. requests๋ฅผ ๊ฒ€์ƒ‰ํ•˜์—ฌ Python HTTP for Humans ์„ค์น˜

>>> r = requests.get('https://api.github.com/user', auth=('user', 'pass'))
>>> r.status_code
200
>>> r.headers['content-type']
'application/json; charset=utf8'
>>> r.encoding
'utf-8'
>>> r.text
'{"type":"User"...'
>>> r.json()
{'private_gists': 419, 'total_private_repos': 77, ...}

๐Ÿ”ธ py

import requests

indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")

print(indeed_result.text)

๐Ÿ”น console

html ํ…์ŠคํŠธ๊ฐ€ ์ถœ๋ ฅ๋œ๋‹ค.

.
.
.
k\u0004My new jobs":[null,""],"job feed link\u0004There are new jobs":[null,""],"jobresults_tip_l_empty_variation_1\u0004Tip: Enter your city or zip code in the \"where\" box to show results in your area.":[null,"Tip: Enter your city or zip code in the \"where\" box to show results in your area."],"mobile_home_query_caption\u0004Job title, keywords, or company":[null,""],"near {0}":[null,""],"new count\u0004{0} new":[null,""],"new count / location separator\u0004in {0}":[null,""],"notice_message_for_empty_q_and_l\u0004Enter a job title or location to start a search":[null,""],"pill_filters\u0004All jobs":[null,""],"pill_filters\u0004Date Posted":[null,""],"pill_filters\u0004Last 14 days":[null,""],"pill_filters\u0004Last 24 hours":[null,""],"pill_filters\u0004Last 3 days":[null,""],"pill_filters\u0004Last 7 days":[null,""],"radius-slider-title\u0004Distance":[null,""],"recent search\u0004There is no recent search":[null,""],"recent search\u0004There is no recent search history.":[null,""],"recent search item\u0004go":[null,""],"recent search item\u0004{0} - {1}":[null,""],"recent_search_aria_label\u0004hide":[null,""],"recent_search_aria_label\u0004please tap the bottom of this page for back to search result.":[null,""],"recent_search_aria_label\u0004show":[null,""],"recent_search_ssr_label\u0004edit searches":[null,""],"recent_search_ssr_label\u0004finish":[null,""],"recent_searches_heading\u0004Search history / Saved Searches":[null,""],"rich search\u0004add filters":[null,""],"rich search\u0004dismiss":[null,""],"search":[null,""],"single search\u0004change query":[null,""],"{0} miles":[null,""],"{0} search suggestion":["{0} search suggestions","",""]};}).bind(this.mosaic.i18nOverrides)();
</script>
<script>window['sendPageLoadEndPing'] = function(pageId, tk, st) {var validPageIds = ['viewjob', 'serp']; if (!!Image && validPageIds.indexOf(pageId) > -1 && !!tk && !!st) {var href = '/rpc/pageLoadEnd?pageId=' + pageId + '&tk=' + tk + '&st=' + st + '&__=' + Math.random(); var img = new Image(); img.src = href;}}; window['sendPageLoadEndPing']("serp", "1g17eob8vghqc800", "1650591542559");</script><div class="mosaic-zone" id="mosaic-zone-serpBottomBody"><div id="mosaic-provider-signinprompt" class="mosaic mosaic-provider-signinprompt mosaic-rst"></div><div id="mosaic-provider-dislike-feedback" class="mosaic mosaic-provider-dislike-feedback"><div class="animatedToast i-unmask"><div class=""></div></div></div></div><script type="text/javascript">
                try {
                    window.mosaic.onMosaicApiReady(function() {
                        var zoneId = 'serpBottomBody';
                        var providers = window.mosaic.zonedProviders[zoneId];

                        if (providers) {
                            providers.filter(function(p) { return window.mosaic.lazyFns[p]; }).forEach(function(p) {
                                return window.mosaic.api.loadProvider(p);
                            });
                        }
                    });
                 } catch (e) {};
                </script></body>
</html>

๐Ÿ“Œ beautiful soup ์„ ํ™œ์šฉํ•˜์—ฌ ํ•„์š”ํ•œ ์ •๋ณด๋งŒ ๊ฐ€์ ธ์˜ค์ž

BeautifulSoup

https://www.crummy.com/software/BeautifulSoup/bs4/doc/

repl.it์—์„œ ํŒจํ‚ค์ง€ ์„ค์น˜๋ฅผ ํ•ด์ฃผ์ž. beautifulsoup4 ๊ฒ€์ƒ‰ํ•˜์—ฌ ์„ค์น˜

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())
soup.title
# <title>The Dormouse's story</title>

soup.title.name
# u'title'

soup.title.string
# u'The Dormouse's story'

soup.title.parent.name
# u'head'

soup.p
# <p class="title"><b>The Dormouse's story</b></p>

soup.p['class']
# u'title'

soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# 2.3 Extracting Indeed Pages

ํŽ˜์ด์ง• ๋ฒˆํ˜ธ๊ฐ€ <div class="pagination">...๋กœ ์ด๋ฃจ์–ด์ ธ์žˆ๋‹ค. ํ•ด๋‹น ๋‚ด์šฉ ํŒŒ์‹ฑํ•ด๋ณด์ž.

๐Ÿ”ธ py

import requests
from bs4 import BeautifulSoup

indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")

indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser')

# ์ „์ฒด ์ถœ๋ ฅ
# print(indeed_soup)

# <title>
# print(indeed_soup.title)

# ํŽ˜์ด์ง• div ์ฐพ๊ธฐ
pagination = indeed_soup.find("div", {"class": "pagination"})

# a link ์ฐพ๊ธฐ
pages = pagination.find_all('a')

# find the <span class="pn">number</span> elements

spans = []

for page in pages:
  spans.append(page.find("span"))

print("---------span/class:pn---------")
print(spans)
print("---------List์—์„œ ๊ฐ€์žฅ ๋งˆ์ง€๋ง‰ item---------")
 # -1: ๋งˆ์ง€๋ง‰์—์„œ๋ถ€ํ„ฐ ์‹œ์ž‘ํ•ด์„œ ์ฒซ item
print(spans[-1])
print("---------List์—์„œ ๊ฐ€์žฅ ๋งˆ์ง€๋ง‰ item ๋นผ๊ณ  ์กฐํšŒ---------")
# ๋งˆ์ง€๋ง‰ ์•„์ดํ…œ ๋นผ๊ณ  ์กฐํšŒ
# spans[0:5] 0๋ถ€ํ„ฐ 5๊ฐœ ์กฐํšŒ
print(spans[:-1])

๐Ÿ”น console

---------span/class:pn---------
[<span class="pn">2</span>, <span class="pn">3</span>, <span class="pn">4</span>, <span class="pn">5</span>, <span class="pn"><span class="np"><svg fill="none" height="24" width="24"><path d="M10 6L8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6-6-6z" fill="#2D2D2D"></path></svg></span></span>]
---------List์—์„œ ๊ฐ€์žฅ ๋งˆ์ง€๋ง‰ item---------
<span class="pn"><span class="np"><svg fill="none" height="24" width="24"><path d="M10 6L8.59 7.41 13.17 12l-4.58 4.59L10 18l6-6-6-6z" fill="#2D2D2D"></path></svg></span></span>
---------List์—์„œ ๊ฐ€์žฅ ๋งˆ์ง€๋ง‰ item ๋นผ๊ณ  ์กฐํšŒ---------
[<span class="pn">2</span>, <span class="pn">3</span>, <span class="pn">4</span>, <span class="pn">5</span>]

# 2.4 Extracting Indeed Pages part Two

pages.append(link.find("span").string) is the same as pages.append(link.string).
beautiful soup reads the string of the span nested inside the link for you.

๐Ÿ”ธ py

import requests
from bs4 import BeautifulSoup

indeed_result = requests.get("https://www.indeed.com/jobs?q=python&limit=50")

indeed_soup = BeautifulSoup(indeed_result.text, 'html.parser')

# ํŽ˜์ด์ง• div ์ฐพ๊ธฐ
pagination = indeed_soup.find("div", {"class": "pagination"})

# a link ์ฐพ๊ธฐ
links = pagination.find_all('a')

# find the <span class="pn">number</span> elements
pages = []
for link in links[:-1]:
  # same as pages.append(link.find("span").string)
  # cast to int
  pages.append(int(link.string))

# html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€
max_page = pages[-1]
print(max_page)

๐Ÿ”น console

5

# 2.5 Requesting Each Page

๐Ÿ“Œ ๋ชจ๋“  ํŽ˜์ด์ง€๋ฅผ requestํ•ด๋ณด์ž.

ํŒŒ์ผ์„ ๋”ฐ๋กœ ๋‘์–ด function์„ ์ •๋ฆฌํ•ด์คฌ๋‹ค.

๐Ÿ”ธ main.py

from indeed import extract_indeed_pages, extract_indeed_jobs

# page ๋ฒˆํ˜ธ ์ „๋ถ€ ์กฐํšŒ ํ›„,
last_indeed_page = extract_indeed_pages()

# request ๋‚ ๋ ค๋ณด๊ธฐ status 200 ์„ฑ๊ณต ์ถœ๋ ฅ
extract_indeed_jobs(last_indeed_page)

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def extract_indeed_pages():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
#  range(N): N๊ฐœ์˜ ๋ฐฐ์—ด์„ ์ƒ์„ฑํ•ด์คŒ.
def extract_indeed_jobs(last_page):
  for page in range(last_page):
    result = requests.get(f"{URL}&start={page*LIMIT}")
    print(result.status_code)

๐Ÿ”น console

200
200
200
200
200

# 2.6 Extracting Titles

์‚ฌ์ดํŠธ ๋งˆํฌ์—…์ด ์ข€ ๋‹ฌ๋ผ์ ธ์„œ ๊ฐ•์˜๋ฅผ ๋“ฃ๊ณ , ๋‚ด ๋ฐฉ์‹๋Œ€๋กœ ์ˆ˜์ •ํ•˜์—ฌ title์„ ๊ฐ€์ ธ์™”๋‹ค.

์—ฌ๊ธฐ์„œ๋ถ€ํ„ฐ ๊ฐ•์˜์™€ ์ฝ”๋“œ๊ฐ€ ์ข€ ๋‹ค๋ฅผ ์˜ˆ์ •,,

๐Ÿ”ธ py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def extract_indeed_pages():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_indeed_jobs(last_page):
  jobs = []
  # 1ํŽ˜์ด์ง€๋กœ ํ…Œ์ŠคํŠธํ•˜๊ธฐ ์œ„ํ•ด for๋ฌธ ์ฃผ์„ ์ฒ˜๋ฆฌ
  #for page in range(last_page):
  result = requests.get(f"{URL}&start={0*LIMIT}")
  soup = BeautifulSoup(result.text, 'html.parser')
  results = soup.find_all("div", {"class": "job_seen_beacon"})
  for result in results:
    title = result.find("td", {"class": "resultContent"}).find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
    print(title) # title
  return jobs

๐Ÿ”น console

Python/Django Developer
Software Development (All Levels)
Python - Machine Learning SME
Coding teacher for teaching Scratch & Python
Python Developer
Python Developer
Fraud Modeler - Credit Cards / Banking
Logistics Analyst
Online Python Teacher
Python Engineer
Python developer
Python Developer
Analytic Methodologist
Data Scientist with Python
Remote Python Developer
Remote Python Developer
Python Developer Ex Google
Python Developer
3D Solutions Analyst (REMOTE)
Entry Level Software Engineer
Backend Engineer (Python)
Entry Level Python Developer
Data Analyst
Lead Python Developer
Python Developer
Python Developer
Python developer
Data Scientist
AWS Python Developer
Data Analyst: Technical Business Intelligence
USSTRATCOM - Analytic Methodologist
Informatica for Google BigQuery
MACHINE LEARNING DATA SCIENTIST - PYTHON AND R
Python Developer
Database Developer
Basketball Data Scientist (remote opportunity)
Python Developer
Python Developer with AWs
Python Developer
Junior Trader (Remote)
Python Developer
Jr. Software Engineer
Python Developer
Backend software engineer (python)
Python Developer
Software Developer โ€“ Entry Level
Python Engineer
Data Scientist
Python Developer
Python Developer

# 2.7 Extracting Companies

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def extract_indeed_pages():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_indeed_jobs(last_page):
  jobs = []
  # 1ํŽ˜์ด์ง€๋กœ ํ…Œ์ŠคํŠธํ•˜๊ธฐ ์œ„ํ•ด for๋ฌธ ์ฃผ์„ ์ฒ˜๋ฆฌ
  #for page in range(last_page):
  result = requests.get(f"{URL}&start={0*LIMIT}")
  soup = BeautifulSoup(result.text, 'html.parser')
  results = soup.find_all("div", {"class": "job_seen_beacon"})
  for result in results:
    resultContent = result.find("td", {"class": "resultContent"})
    # title
    title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
    # company
    company = resultContent.find("div", {"class":"company_location"}).find("span", {"class": "companyName"}).string
    print(f"{title} >>> {company}")
  return jobs

๐Ÿ”น console

Software Development (All Levels) >>> Insurity
Entry Level Software Engineer >>> Avant
Python/Django Developer >>> Delta
Python - Machine Learning SME >>> Envision
Coding teacher for teaching Scratch & Python >>> YoumeCan Education Center
Python Developer >>> Mechlance INC
Python Developer >>> Mobile Mechanic
Fraud Modeler - Credit Cards / Banking >>> Acunor
Logistics Analyst >>> Lululemon
Software Engineer (Early Career) >>> Apple
Python Engineer >>> Highbrow-Tech
Online Python Teacher >>> YC Solutions Pvt. Ltd.
Python Developer >>> Integration Developer Network LLC
Python/Odoo ERP Developer >>> Novobi, LLC
Python developer >>> Vyze, Inc.
Remote Python Developer >>> Piper Companies
Software Engineer I ( 100% Remote) >>> Windstream Communications
Data Scientist with Python >>> Techladder
Analytic Methodologist >>> Constellation West
Remote Python Developer >>> CTI Consulting
Python Developer Ex Google >>> Laiba Technologies
3D Solutions Analyst (REMOTE) >>> Under Armour
Entry Level Python Developer >>> Marlabs
Backend Engineer (Python) >>> Metaverse HQ
Python Developer >>> WorkCog inc
Data Analyst >>> Young Life
MACHINE LEARNING DATA SCIENTIST - PYTHON AND R >>> InspiHER Tech
Lead Python Developer >>> Interaslabs.llc
Python Developer >>> Yaddala Consulting
Data Analyst: Technical Business Intelligence >>> Barstool Sports
USSTRATCOM - Analytic Methodologist >>> Apogee Engineering, LLC
AWS Python Developer >>> Inscope Global Solutions
Data Scientist >>> FIIDUS
Informatica for Google BigQuery >>> Kaygen
Python Developer >>> Business Intelli Solutions
Python Developer >>> AGM Tech Solutions, INC.
Database Developer >>> Integration Developer Network LLC
Python developer >>> Stefanini Group
Python Developer >>> Aquatic Capital Management
Python Developer >>> Santex Group
Python Developer with AWs >>> Innovative BI Solutions Inc
Basketball Data Scientist (remote opportunity) >>> Madison Square Garden Entertainment
Jr. Software Engineer >>> NBCUniversal
Backend software engineer (python) >>> Benchmark IT Solutions
Python Developer >>> Swan Software Solutions
Python Developer >>> Benedsoft
Software Developer โ€“ Entry Level >>> Grant Street Group
Python Developer >>> Infinizi IT Solutions Pvt. Ltd.
Python Engineer >>> Techno corporation inc
Python Developer >>> Morgan Stanley

# 2.8 Extracting Locations and Finishing up

id๊นŒ์ง€ ๊ฐ€์ ธ์˜ค๋„๋ก ํŒŒ์‹ฑํ•˜๊ธฐ

https://www.indeed.com/viewjob?jk={id} ์™€ ๊ฐ™์ด ๋งํฌ ์ƒ์„ฑ ๊ฐ€๋Šฅ!

๐Ÿ”ธ main.py

from indeed import extract_indeed_pages, extract_indeed_jobs

last_indeed_page = extract_indeed_pages()

indeed_jobs = extract_indeed_jobs(last_indeed_page)

print(indeed_jobs)

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def extract_indeed_pages():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ํšŒ์‚ฌ ์ •๋ณด ํŒŒ์‹ฑ
def extract_job(html):
  resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
  # id
  id = html["data-jk"]
  # title
  title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
  # company
  company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
  # location
  location = resultContent.find("div", {"class": "companyLocation"}).string
  
  return {'id': id, 'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_indeed_jobs(last_page):
  jobs = []
  # ๋ชจ๋“  ํŽ˜์ด์ง€ ์ง์—… ์กฐํšŒ
  for page in range(last_page):
    print(f"Scrapping page {page}")
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, 'html.parser')
    results = soup.find_all("a", {"class": "tapItem"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

๐Ÿ”น console

Scrapping page 0
Scrapping page 1
Scrapping page 2
Scrapping page 3
Scrapping page 4
[{'id': 'b6d26975703d41c2', 'title': 'Python - Machine Learning SME', 'company': 'Envision', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=b6d26975703d41c2'}, {'id': '5a91a49780ab17df', 'title': 'Sr Data Scientist', 'company': 'Zillow', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=5a91a49780ab17df'}, {'id': '2bcd843c58159429', 'title': 'Software Engineer (Early Career)', 'company': 'Apple', 'location': None, 'link': 'https://www.indeed.com/viewjob?jk=2bcd843c58159429'}, {'id': '6d0ab231885eac14', 'title': 'GIS Analyst', 'company': 'Bruce Harris & Associates, Inc', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=6d0ab231885eac14'}, {'id': '788dc9dd8ade27a6', 'title': 'Python/Django Developer', 'company': 'Delta', 'location': None, 'link': 'https://www.indeed.com/viewjob?jk=788dc9dd8ade27a6'}, {'id': 'ebcc2ad72eda66fb', 'title': 'QT Software Engineer (Python and C++)', 'company': 'TriSearch', 'location': 'Remote', 'link': 'https://www.indeed.com/viewjob?jk=ebcc2ad72eda66fb'}, {'id': '57654f0e7ccfc3b7', 'title': ' ... ์ƒ๋žต

# 2.9 StackOverflow Pages

https://stackoverflow.com/jobs/companies?q=python ๋ฅผ ํŒŒ์‹ฑํ•ด๋ณด์ž ! python์œผ๋กœ ๊ฒ€์ƒ‰ํ•˜๋Š” url์ด๋‹ค.

์šฐ์„  stackoverflow์™€ ๊ตฌ๋ถ„ํ•˜๊ธฐ ์œ„ํ•ด์„œ indeed.py, so.py ๋กœ ๋‚˜๋ˆ ์ฃผ์—ˆ๊ณ , main.py์ฝ”๋“œ๋ฅผ ์ข€ ์ •๋ฆฌํ•˜์˜€๋‹ค.

so.py๋ฅผ ํ…Œ์ŠคํŠธํ•˜๊ธฐ ์œ„ํ•ด ํ…Œ์ŠคํŠธ๋ฅผ ์œ„ํ•ด ๊ธฐ์กด indeed ํŒŒ์‹ฑ ๋ฉ”์†Œ๋“œ๋Š” ์ฃผ์„์ฒ˜๋ฆฌ ํ•ด์ฃผ์—ˆ๊ณ , pagination์˜ a ํƒœ๊ทธ๋ฅผ find_all๋กœ ์šฐ์„  ๊ฐ€์ ธ์™”๋‹ค.

๐Ÿ”ธ main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs

#indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def get_last_page():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ํšŒ์‚ฌ ์ •๋ณด ํŒŒ์‹ฑ
def extract_job(html):
  resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
  # id
  id = html["data-jk"]
  # title
  title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
  # company
  company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
  # location
  location = resultContent.find("div", {"class": "companyLocation"}).string
  
  return {'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_jobs(last_page):
  jobs = []
  # ๋ชจ๋“  ํŽ˜์ด์ง€ ์ง์—… ์กฐํšŒ
  for page in range(last_page):
    print(f"Scrapping page {page}")
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, 'html.parser')
    results = soup.find_all("a", {"class": "tapItem"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs

๐Ÿ”ธ so.py

import requests
from bs4 import BeautifulSoup

URL =  f"https://stackoverflow.com/jobs/companies?q=python"

def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
  print(pages)
  
def get_jobs():
  last_page = get_last_page()
  return []

๐Ÿ”น console

[<a class="s-pagination--item is-selected" href="/jobs/companies?q=python" title="page 1 of 21">
<span>1</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&amp;pg=2" title="page 2 of 21">
<span>2</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&amp;pg=3" title="page 3 of 21">
<span>3</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&amp;pg=4" title="page 4 of 21">
<span>4</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&amp;pg=21" title="page 21 of 21">
<span>21</span>
</a>, <a class="s-pagination--item" href="/jobs/companies?q=python&amp;pg=2" title="page 2 of 21">
<span>next</span><i class="material-icons">chevron_right</i>
</a>]

# 2.10 StackOverflow extract jobs

strip

get_text(strip=True) ๋ฅผ ์‚ฌ์šฉํ•˜๋ฉด text๋ฅผ ๊ฐ€์ ธ์˜ด๊ณผ ๋™์‹œ์— ์•ž๋’ค ๊ณต๋ฐฑ์„ ์ž˜๋ผ์ค€๋‹ค.
์ฐธ๊ณ  (opens new window)

>>> markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>'
>>> soup = BeautifulSoup(markup, 'html.parser')

>>> soup.get_text()
'\nI linked to example.com\n'
>>> soup.i.get_text()
'example.com'

>>> soup.get_text("|", strip=True)
'I linked to|example.com'

๐Ÿ”ธ so.py

import requests
from bs4 import BeautifulSoup

URL =  f"https://stackoverflow.com/jobs/companies?q=python"

# ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๊ฐ€์ ธ์˜ค๊ธฐ
def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
  # ๋งˆ์ง€๋ง‰(-1)์€ next๋ฒ„ํŠผ์ด๋ฏ€๋กœ ๋งˆ์ง€๋ง‰์—์„œ 2๋ฒˆ์งธ๊ฑฐ(-2)๊ฐ€ last page
  # strip=True๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์•ž๋’ค ๊ณต๋ฐฑ ์ž๋ฅด๊ธฐ
  last_page = pages[-2].get_text(strip=True) 
  return int(last_page)

# ํšŒ์‚ฌ ๊ฐ€์ ธ์˜ค๊ธฐ
def extract_companies(last_page):
  companies = []
  # last page์˜ ๊ฐœ์ˆ˜๋งŒํผ ๋ฐฐ์—ด ๋งŒ๋“ค์–ด์„œ for๋ฌธ ๋Œ๋ฆฌ๊ธฐ
  for page in range(last_page):
    result = requests.get(f"{URL}&pg={page + 1}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "-company"})
    for result in results:
      print(result.find("div", {"class": "dismiss-trigger"})["data-id"])
   
def get_jobs():
  last_page = get_last_page()
  companies = extract_companies(last_page)
  return companies

๐Ÿ”น console

31152
17914
26760
32154
3060
...
์ค‘๋žต
...
32169
4603
32176
23691
20917

# 2.11~12 StackOverflow extract job

ํšŒ์‚ฌ ์ •๋ณด๋ฅผ ๊ฐ€์ ธ์™€ ํŒŒ์‹ฑํ•ด์ฃผ์—ˆ๋‹ค.

recursive

Using find_all("title", recursive=False) searches only the direct children and does not dig into deeply nested tags.
(Reference: the BeautifulSoup docs.)

soup.html.find_all("title")
# [<title>The Dormouse's story</title>]

soup.html.find_all("title", recursive=False)
# []

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

URL =  f"https://stackoverflow.com/jobs/companies?q=python"

# ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๊ฐ€์ ธ์˜ค๊ธฐ
def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
  # ๋งˆ์ง€๋ง‰(-1)์€ next๋ฒ„ํŠผ์ด๋ฏ€๋กœ ๋งˆ์ง€๋ง‰์—์„œ 2๋ฒˆ์งธ๊ฑฐ(-2)๊ฐ€ last page
  # strip=True๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์•ž๋’ค ๊ณต๋ฐฑ ์ž๋ฅด๊ธฐ
  last_page = pages[-2].get_text(strip=True) 
  return int(last_page)

def extract_company(html):
  content = html.find("div", {"class": "flex--item fl1 text mb0"})
  # company
  company = content.find("h2").find("a", {"class": "s-link"}).string

  location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
  # location
  location = location.get_text(strip=True)
  # industry
  industry = industry.get_text(strip=True)
  print(location, industry)
  return {"company": company, "location": location, "industry": industry}
  
  
# ํšŒ์‚ฌ ๊ฐ€์ ธ์˜ค๊ธฐ
def extract_companies(last_page):
  companies = []
  # last page์˜ ๊ฐœ์ˆ˜๋งŒํผ ๋ฐฐ์—ด ๋งŒ๋“ค์–ด์„œ for๋ฌธ ๋Œ๋ฆฌ๊ธฐ
  for page in range(last_page):
    result = requests.get(f"{URL}&pg={page + 1}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "-company"})
    for result in results:
      company = extract_company(result)
      companies.append(company)
  return companies
  
def get_jobs():
  last_page = get_last_page()
  companies = extract_companies(last_page)
  return companies

๐Ÿ”น console

Edinburgh; Beirut; Bozeman Cloud Computing, Education Technology, SaaS
Dublin 1 Agile Software Development, Cloud-Based Solutions, Computer Software
Mรผnchen Computer Vision, Image Guided Surgery, Medical Imaging
United States Cybersecurity, Healthcare
Elkridge; Linthicum Heights; Vienna Computer Software
...
์ค‘๋žต
...
No office location Retail, Technical Services, Web Technology
Fulton Business to Business, Security Software
No office location Bioinformatics, Computer Software, Digital Health
No office location Agile Software Development, Software Development / Engineering, Technology Staffing
Berlin Agile Software Development, Automotive

# 2.13 StackOverflow Finish

indeed์™€ stackoverflow ์—์„œ ํŒŒ์‹ฑํ•œ ๊ฒƒ๋“ค์„ ํ•ฉ์ณ์ฃผ์ž.

๐Ÿ”ธ main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs

indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()

jobs = so_jobs + indeed_jobs

๊ฐ๊ฐ scrapping ์„ฑ๊ณต์—ฌ๋ถ€๋ฅผ ํ™•์ธํ•˜๊ธฐ ์œ„ํ•ด ์•„๋ž˜์™€ ๊ฐ™์ด for๋ฌธ์•ˆ์— printํ•ด์ฃผ์ž

๐Ÿ”ธ indeed.py

.
.
.
# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_jobs(last_page):
  jobs = []
  # ๋ชจ๋“  ํŽ˜์ด์ง€ ์ง์—… ์กฐํšŒ
  for page in range(last_page):
    print(f"Scrapping ID: Page: {page}")
.
.
.

๐Ÿ”ธ so.py

.
.
.
# ํšŒ์‚ฌ ๊ฐ€์ ธ์˜ค๊ธฐ
def extract_companies(last_page):
  companies = []
  # last page์˜ ๊ฐœ์ˆ˜๋งŒํผ ๋ฐฐ์—ด ๋งŒ๋“ค์–ด์„œ for๋ฌธ ๋Œ๋ฆฌ๊ธฐ
  for page in range(last_page):
    print(f"Scrapping SO: Page: {page}")
.
.
.

๐Ÿ”น console

Scrapping ID: Page: 0
Scrapping ID: Page: 1
Scrapping ID: Page: 2
Scrapping ID: Page: 3
Scrapping ID: Page: 4
Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
Scrapping SO: Page: 19
Scrapping SO: Page: 20

# 2.14 What is CSV

CSV : Comma Separated Values

  • vsCode ์—์„œ ExcelViewer ํ”Œ๋Ÿฌ๊ทธ์ธ์„ ์„ค์น˜ํ•œ๋‹ค.

(screenshot)

  • we.csv ํŒŒ์ผ์„ ์ƒ์„ฑํ•œ๋‹ค.
name, last Name, age, gender
nico, serrano, 12, male
nico, serrano, 12, male
nico, serrano, 12, male
  • we.csv ํŒŒ์ผ์„ vsCode์—์„œ preview๋กœ ์—ด์–ด๋ณธ๋‹ค.

(screenshot)

  • google spreadsheet์—์„œ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด๋ณธ๋‹ค.

(screenshot)
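
As a quick sanity check (my addition, not from the lecture), the file can be read back with Python's built-in csv module; note how the space after each comma survives as part of the field.

🔸 py

import csv

# read we.csv back; each row comes out as a list of strings
with open("we.csv") as file:
  reader = csv.reader(file)
  for row in reader:
    print(row)

🔹 console

['name', ' last Name', ' age', ' gender']
['nico', ' serrano', ' 12', ' male']
...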

์ž„์˜๋กœ ์•„๋ž˜์™€ ๊ฐ™์ด save.py ํŒŒ์ผ ์ƒ์„ฑ

๐Ÿ”ธ main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

indeed_jobs = get_indeed_jobs()
so_jobs = get_so_jobs()

jobs = so_jobs + indeed_jobs
save_to_file(jobs)

๐Ÿ”ธ save.py

import csv

def save_to_file(jobs):
  return 

# 2.15 Saving to CSV

๐Ÿ”ธ main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_jobs as get_so_jobs
from save import save_to_file

indeed_jobs = get_indeed_jobs()
#so_jobs = get_so_jobs()

jobs = indeed_jobs
save_to_file(jobs)

๐Ÿ”ธ save.py

import csv

def save_to_file(jobs):
  file = open("jobs.csv", mode="w")
  writer = csv.writer(file)
  # ํ—ค๋”์ค„ ์ƒ์„ฑ
  writer.writerow(["title", "company", "location", "link"])
  for job in jobs:
    # dict์—์„œ values๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด dict_values๊ฐ€ type์ž„
    # ๋”ฐ๋ผ์„œ list๋กœ cast ํ•ด์ค€๋‹ค
    writer.writerow(list(job.values()))
  return 

๐Ÿ”น jobs.csv

title,company,location,link
Remote Python Developer,CTI Consulting,,https://www.indeed.com/viewjob?jk=75422ff0a5cfbe28
Python Developer,Aquatic Capital Management,"Remote in Chicago, IL",https://www.indeed.com/viewjob?jk=5bdb3c2265c60c4a
Senior Python Developer,Gallup,,https://www.indeed.com/viewjob?jk=b3a32bc1a87689e7
C++/Python Developer,FIIDUS,Remote,https://www.indeed.com/viewjob?jk=3720520a9b3f386f
Informatica for Google 
.
.
.
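
A side note (my own sketch, not the course code): csv.DictWriter writes the dicts directly, so the column order doesn't depend on the key order of job.values().

🔸 py

import csv

def save_to_file(jobs):
  # DictWriter maps each dict key to the matching column by name
  with open("jobs.csv", mode="w") as file:
    writer = csv.DictWriter(file, fieldnames=["title", "company", "location", "link"])
    writer.writeheader()
    writer.writerows(jobs)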

# 2.16 OMG THIS IS AWESOME

๋งˆ์ง€๋ง‰์œผ๋กœ ์ฝ”๋“œ ์ •๋ฆฌ๋ฅผ ํ•˜๋ฉด ์•„๋ž˜์™€ ๊ฐ™์ด ๋˜๋ฉฐ, csv ํŒŒ์ผ ๋‘๊ฐœ๊ฐ€ ์ƒ์„ฑ๋˜๋Š” ๊ฒƒ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.

python_scrapper_replit

๐Ÿ”ธ main.py

from indeed import get_jobs as get_indeed_jobs
from so import get_companies as get_so_companies
from save import save_to_file_jobs, save_to_file_companies

indeed_jobs = get_indeed_jobs()
so_companies = get_so_companies()

save_to_file_jobs(indeed_jobs)
save_to_file_companies(so_companies)

๐Ÿ”ธ indeed.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50
URL = f"https://www.indeed.com/jobs?q=python&limit={LIMIT}"

# pagination ์ค‘ ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ
def get_last_page():
  result = requests.get(URL)
  
  soup = BeautifulSoup(result.text, 'html.parser')
  
  # html๋‚ด ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ์ฐพ๊ธฐ
  pagination = soup.find("div", {"class": "pagination"})
  
  links = pagination.find_all('a')
  
  pages = []
  for link in links[:-1]:
    pages.append(int(link.string))
  
  max_page = pages[-1]
  return max_page

# ํšŒ์‚ฌ ์ •๋ณด ํŒŒ์‹ฑ
def extract_job(html):
  resultContent = html.find("div", {"class": "job_seen_beacon"}).find("td", {"class": "resultContent"})
  # id
  id = html["data-jk"]
  # title
  title = resultContent.find("h2", {"class": "jobTitle"}).find("span", {"class": None})["title"]
  # company
  company = resultContent.find("div", {"class": "company_location"}).find("span", {"class": "companyName"}).string
  # location
  location = resultContent.find("div", {"class": "companyLocation"}).string
  
  return {'title': title, 'company': company, 'location': location, 'link': f"https://www.indeed.com/viewjob?jk={id}"}

# ๊ฐ page์˜ start index ๊ตฌํ•˜์—ฌ request ๋‚ ๋ ค๋ณด๊ธฐ
def extract_jobs(last_page):
  jobs = []
  # ๋ชจ๋“  ํŽ˜์ด์ง€ ์ง์—… ์กฐํšŒ
  for page in range(last_page):
    print(f"Scrapping ID: Page: {page}")
    result = requests.get(f"{URL}&start={page*LIMIT}")
    soup = BeautifulSoup(result.text, 'html.parser')
    results = soup.find_all("a", {"class": "tapItem"})
    for result in results:
      job = extract_job(result)
      jobs.append(job)
  return jobs

def get_jobs():
  last_page = get_last_page()
  jobs = extract_jobs(last_page)
  return jobs

๐Ÿ”ธ so.py

import requests
from bs4 import BeautifulSoup

URL =  f"https://stackoverflow.com/jobs/companies?q=python"

# ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๊ฐ€์ ธ์˜ค๊ธฐ
def get_last_page():
  result = requests.get(URL)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
  # ๋งˆ์ง€๋ง‰(-1)์€ next๋ฒ„ํŠผ์ด๋ฏ€๋กœ ๋งˆ์ง€๋ง‰์—์„œ 2๋ฒˆ์งธ๊ฑฐ(-2)๊ฐ€ last page
  # strip=True๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์•ž๋’ค ๊ณต๋ฐฑ ์ž๋ฅด๊ธฐ
  last_page = pages[-2].get_text(strip=True) 
  return int(last_page)

def extract_company(html):
  content = html.find("div", {"class": "flex--item fl1 text mb0"})
  # company
  company = content.find("h2").find("a", {"class": "s-link"}).string

  location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
  # location
  location = location.get_text(strip=True)
  # industry
  industry = industry.get_text(strip=True)
  
  # link
  link = content.find("h2").find("a", {"class": "s-link"})['href']

  return {"company": company, "location": location, "industry": industry, "apply_link": f"https://stackoverflow.com{link}"}
  
  
# ํšŒ์‚ฌ ๊ฐ€์ ธ์˜ค๊ธฐ
def extract_companies(last_page):
  companies = []
  # last page์˜ ๊ฐœ์ˆ˜๋งŒํผ ๋ฐฐ์—ด ๋งŒ๋“ค์–ด์„œ for๋ฌธ ๋Œ๋ฆฌ๊ธฐ
  for page in range(last_page):
    print(f"Scrapping SO: Page: {page}")
    result = requests.get(f"{URL}&pg={page + 1}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "-company"})
    for result in results:
      company = extract_company(result)
      companies.append(company)
  return companies
  
def get_companies():
  last_page = get_last_page()
  companies = extract_companies(last_page)
  return companies

๐Ÿ”ธ save.py

import csv

def save_to_file_jobs(jobs):
  file = open("jobs.csv", mode="w")
  writer = csv.writer(file)
  # ํ—ค๋”์ค„ ์ƒ์„ฑ
  writer.writerow(["title", "company", "location", "link"])
  for job in jobs:
    # dict์—์„œ values๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด dict_values๊ฐ€ type์ž„
    # ๋”ฐ๋ผ์„œ list๋กœ cast ํ•ด์ค€๋‹ค
    writer.writerow(list(job.values()))
  return 
  
def save_to_file_companies(companies):
  file = open("companies.csv", mode="w")
  writer = csv.writer(file)
  # ํ—ค๋”์ค„ ์ƒ์„ฑ
  writer.writerow(["company", "location", "industry", "apply_link"])
  for company in companies:
    # dict์—์„œ values๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด dict_values๊ฐ€ type์ž„
    # ๋”ฐ๋ผ์„œ list๋กœ cast ํ•ด์ค€๋‹ค
    writer.writerow(list(company.values()))
  return 

๐Ÿ”น console

Scrapping ID: Page: 0
Scrapping ID: Page: 1
Scrapping ID: Page: 2
Scrapping ID: Page: 3
Scrapping ID: Page: 4
Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
Scrapping SO: Page: 19
Scrapping SO: Page: 20

๐Ÿ”น jobs.csv

title,company,location,link
"Security Engineer- AWS, Python",The Getch,Remote,https://www.indeed.com/viewjob?jk=5fba88b67d1b72dc
Python Developer,Paktolus,Remote,https://www.indeed.com/viewjob?jk=df1cce3cc988f374
Python Developer,Simplified IT Solutions,Remote,https://www.indeed.com/viewjob?jk=7e3a3e84485bb544
Python Developer,EMR CPR LLC,"Austin, TX",https://www.indeed.com/viewjob?jk=52298adb7d458010
Senior Python AWS Developer,DataAxxis,,https://www.indeed.com/viewjob?jk=cd01de5c4adc7ac4
Associate Solutions Architect โ€“ Early Career 2022 (US),"Amazon Web Services, Inc.",,https://www.indeed.com/viewjob?jk=3905541e1957ec4a
Python Developer,Oremda Infotech Inc.,Remote,https://www.indeed.com/viewjob?jk=2c04530f755a2932
Senior Software Engineer,University of Nebraska Medical Center,,https://www.indeed.com/viewjob?jk=067a8cf9dccbdc70
...์ƒ๋žต

๐Ÿ”น companies.csv

company,location,industry,apply_link
Administrate,Edinburgh; Beirut; Bozeman,"Cloud Computing, Education Technology, SaaS",https://stackoverflow.com/jobs/companies/administrate?c=MYHq0mvrMlWD3iKY&q=python
"Arista Networks, Inc",Dublin 1,"Agile Software Development, Cloud-Based Solutions, Computer Software",https://stackoverflow.com/jobs/companies/www-arista-com?
...์ƒ๋žต

# โšก 3. GET READY FOR DJANGO

# 3.0 Django is AWESOME

An introduction to Django!

# 3.1 *args **kwargs

Django๋Š” ๋ฌดํ•œ arguments(*args)๋ฅผ ์ค„ ์ˆ˜ ์žˆ๋‹ค. ํ•˜์ง€๋งŒ key=value์ธ argument๋ฅผ ์ฃผ๋ ค๋ฉด **kwargs๋ฅผ ์จ์•ผํ•œ๋‹ค. keyword arguments์˜ ์ถ•์•ฝ์–ด์ด๋‹ค.

๐Ÿ”ธ py

def plus(a, b, *args, **kwargs): 
  print(args)
  print(kwargs) # key=value๊ฐ’์€ keyword argument๋กœ ๋ฐ›์•„์•ผํ•จ
  return a + b

plus(1, 2, 3, 4, 5, 1, 2, 3, 4, 3, 4, 5, hello=True, bye=True)

๐Ÿ”น console

(3, 4, 5, 1, 2, 3, 4, 3, 4, 5)
{'hello': True, 'bye': True}

๋ฌดํ•œ ๊ณ„์‚ฐ๊ธฐ๋ฅผ ๋งŒ๋“ค๋ฉด ์•„๋ž˜์™€ ๊ฐ™๋‹ค.

๐Ÿ”ธ py

def plus(*args):
  result = 0
  for number in args:
    result += number
  print(result)

plus(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

๐Ÿ”น console

55
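
The same symbols also work in the other direction (my aside, not from the lecture): * unpacks a sequence into positional arguments and ** unpacks a dict into keyword arguments.

🔸 py

def plus(a, b, *args, **kwargs):
  print(args, kwargs)
  return a + b

nums = [1, 2, 3, 4]
flags = {"hello": True}
# equivalent to plus(1, 2, 3, 4, hello=True)
plus(*nums, **flags)

🔹 console

(3, 4) {'hello': True}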

# 3.2 Intro to Object Oriented Programming

๐Ÿ”ธ py

# ์ฒญ์‚ฌ์ง„ (blueprint)
class Car():
  wheels = 4
  doors = 4
  windows = 4
  seats = 4

porche = Car() #instance
porche.color = "Red"
print(porche.windows, porche.color)

ferrari = Car()
ferrari.color = "Yellow"
print(ferrari.windows, ferrari.color)

mini = Car()
mini.color = "White"

๐Ÿ”น console

4 Red
4 Yellow

# 3.3 Methods part One

method

class ์•ˆ์— ์žˆ๋Š” function์„ method๋ผ๊ณ  ํ•œ๋‹ค.
์ „์—ญ์œผ๋กœ ์„ ์–ธ๋์„ ๊ฒฝ์šฐ์—๋Š” function / ํด๋ž˜์Šค ์•ˆ์— ์„ ์–ธ๋œ ๊ฒฝ์šฐ์—๋Š” method

python์€ ๋ชจ๋“  method์— 1๊ฐœ์˜ argument๋ฅผ ํ•„์ˆ˜๋กœ ๊ฐ–๋Š”๋‹ค.
method์˜ ์ฒซ๋ฒˆ์งธ argument๋Š” method๋ฅผ ํ˜ธ์ถœํ•˜๋Š” ์ž๊ธฐ ์ž์‹ , instance์ด๋‹ค.

๐Ÿ”ธ py

# ์ฒญ์‚ฌ์ง„ (blueprint)
class Car():
  wheels = 4
  doors = 4
  windows = 4
  seats = 4
  # method (ํด๋ž˜์Šค ์•ˆ์— ์žˆ์œผ๋ฉด method/๋ฐ–์ด๋ฉด function)
  def start(self):
    print(self.doors)
    print(self.color)
    print("I started")
    
porche = Car()
porche.color = "RED"
porche.start()

๐Ÿ”น console

4
RED
I started

# 3.4 Methods part Two

๐Ÿ”ธ py

# ์ฒญ์‚ฌ์ง„ (blueprint)
class Car():
  # init์œผ๋กœ ๋ฐ”๊ฟ”์ฃผ๋Š” ๊ฒŒ ๋ฐ”๋žŒ์งํ•จ
  def __init__(self, **kwargs):
    # print(kwargs)
    self.wheels = 4
    self.doors = 4
    self.windows = 4
    self.seats = 4
    # ๊ฐ’์ด ์—†์„ ๊ฒฝ์šฐ, ๋‘๋ฒˆ์งธ ์ธ์ž ํ• ๋‹น
    self.color = kwargs.get("color", "black")
    self.price = kwargs.get("price", "$20")
    
  # method override
  def __str__(self):
    return f"Car with {self.wheels} wheels"

# dir ํด๋ž˜์Šค ์•ˆ ๋ชจ๋“ ๊ฒƒ๋“ค์€ list๋กœ ๋ณด์—ฌ์ค€๋‹ค.
# print(dir(Car))

porche = Car(color="green", price="$40")
# porche๋ฅผ ํ˜ธ์ถœํ• ๋•Œ๋งˆ๋‹ค ๊ธฐ๋ณธ ๋ฉ”์†Œ๋“œ์ธ __str__์„ ํ˜ธ์ถœ
print(porche)
print(porche.color, porche.price)

mini = Car()
print(mini.color, mini.price)

๐Ÿ”น console

Car with 4 wheels
green $40
black $20

# 3.5 Extending Classes

๐Ÿ”ธ py

# ์ฒญ์‚ฌ์ง„ (blueprint)
class Car():
  # init์œผ๋กœ ๋ฐ”๊ฟ”์ฃผ๋Š” ๊ฒŒ ๋ฐ”๋žŒ์งํ•จ
  def __init__(self, **kwargs):
    # print(kwargs)
    self.wheels = 4
    self.doors = 4
    self.windows = 4
    self.seats = 4
    # ๊ฐ’์ด ์—†์„ ๊ฒฝ์šฐ, ๋‘๋ฒˆ์งธ ์ธ์ž ํ• ๋‹น
    self.color = kwargs.get("color", "black")
    self.price = kwargs.get("price", "$20")
    
  # method override
  def __str__(self):
    return f"Car with {self.wheels} wheels"

# extends Car class
class Convertible(Car):
  # ๋ถ€๋ชจ init์— ์ถ”๊ฐ€ ์ž‘์—…
  def __init__(self, **kwargs):
    super().__init__(**kwargs) # ๋ถ€๋ชจ ํด๋ž˜์Šค ํ˜ธ์ถœ
    self.time = kwargs.get("time", 10)
  # add method
  def take_off(self):
    return "taking off"
  # override
  def __str__(self):
    return f"Car with no roof"
    
porche = Convertible(color="green", price="$40")

mini = Car()
print(porche)
print(porche.color)
print(porche.take_off())

๐Ÿ”น console

Car with no roof
green
taking off

# โšก 4. 2020 BONUS CLASS

# 4.0 Welcome to 2020 Update

Flask : ํŒŒ์ด์ฌ์œผ๋กœ ์›น ์‚ฌ์ดํŠธ๋ฅผ ๋งŒ๋“ค ์ˆ˜ ์žˆ๊ฒŒ ํ•ด์ฃผ๋Š” micro-framework

Flask๋ฅผ ํ™œ์šฉํ•˜์—ฌ scrapper๋ฅผ ์›น์„œ๋ฒ„์— ์˜ฌ๋ฆด ์˜ˆ์ • !

# 4.1 Introduction to Flask

replit ํŒจํ‚ค์ง€์—์„œ Flask๋ฅผ ๊ฒ€์ƒ‰ ํ›„ installํ•ด์ค€๋‹ค.

๐Ÿ”ธ py

from flask import Flask

app = Flask("SuperScrapper")

app.run(host="0.0.0.0")

๐Ÿ”น web

(screenshot)

์ฝ”๋“œ๊ฐ€ ๋ฐ”๋€”๋•Œ๋งˆ๋‹ค ๋‹ค์‹œ runํ•ด์ค€๋‹ค.

๐Ÿ”ธ py

from flask import Flask

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return "Hello! Welcome to mi casa!"

@app.route("/contact")
def potato(): # ์ด๋ฆ„ ๋‹ฌ๋ผ๋„ ๋จ
  return "Contact me!"
  
app.run(host="0.0.0.0")

๐Ÿ”น web

(screenshots)

# 4.2 Dynamic URLs and Templates

๐Ÿ”ธ py

from flask import Flask

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return "Hello! Welcome to mi casa!"

# placeholder๋ฅผ ๋ฐ›์œผ๋ฉด ์‚ฌ์šฉํ•ด์ค˜์•ผ ํ•œ๋‹ค. ์•ˆ๊ทธ๋Ÿผ error๋ฐœ์ƒ
# TypeError: potato() got an unexpected keyword argument 'username'
@app.route("/<username>")
def potato(username):
  return f"Hello your name is {username}"
  
app.run(host="0.0.0.0")

๐Ÿ”น web

(screenshot)

templates ํด๋” ์ƒ์„ฑ ํ›„, htmlํŒŒ์ผ์„ ํ•˜๋‚˜ ๋งŒ๋“ค์–ด์ฃผ์ž.

๐Ÿ”ธ potato.html

<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Job Search</h1>
    <form>
      <input placeholder='Search for a job' required name='word'/>
      <button>Search</button>
    </form>
  </body>
</html>

๐Ÿ”ธ py

from flask import Flask, render_template

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return render_template("potato.html")


app.run(host="0.0.0.0")

๐Ÿ”น web

(screenshot)

์ด๋•Œ, ๊ฒฝ๋กœ๋ฅผ ์ฃผ์ง€ ์•Š์•„๋„ htmlํŒŒ์ผ์„ ์ฐพ์•„์„œ ๋„์›Œ์ฃผ๋Š”๋ฐ, flask๊ฐ€ ์ €์ ˆ๋กœ ์ฐพ์•„์ค˜์„œ ๋œฐ ์ˆ˜ ์žˆ๋Š” ๊ฑฐ๋‹ค! ์‹ ๊ธฐ..

# 4.3 Forms and Query Arguments

report.html ํ…œํ”Œ๋ฆฟ์„ ํ•˜๋‚˜ ๋” ๋งŒ๋“ค์–ด์ฃผ๊ณ , ๊ฒ€์ƒ‰๋ฒ„ํŠผ์„ ๋ˆ„๋ฅด๋ฉด ์•„๋ž˜์™€ ๊ฐ™์ด query๋ฅผ ๋ฐ›์•„์˜ค๋„๋ก ํ•ด์ค€๋‹ค.

๋ฐ›์€ ๊ฐ’์„ render_template์œผ๋กœ report.html์— ๋„˜๊ฒจ renderingํ•ด์ค€๋‹ค.

๐Ÿ”ธ report.html

<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>You are looking for {{searchingBy}}</h3>
    <h4>{{color}}</h4>
  </body>
</html>

๐Ÿ”ธ main.py

from flask import Flask, render_template, request

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word') # read the query parameter
  return render_template("report.html", searchingBy=word, color="RED") # pass values into the template

app.run(host="0.0.0.0")

๐Ÿ”น web

(screenshots)

# 4.4 Scrapper Integration

๊ฒ€์ƒ‰์–ด๊ฐ€ ๋„˜์–ด์˜ค๋ฉด ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝํ•˜๋„๋ก ์ˆ˜์ •ํ•˜์˜€๋‹ค.

๋˜ํ•œ ๊ฒ€์ƒ‰์–ด ์—†์ด /report url๋กœ๋งŒ ์ ‘๊ทผํ–ˆ์„ ๋•Œ๋Š” redirect๋˜๋„๋ก ๋ณ€๊ฒฝํ•˜์˜€๋‹ค.

๐Ÿ”ธ py

from flask import Flask, render_template, request, redirect

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html", searchingBy=word, color="RED")

app.run(host="0.0.0.0")

์ด์ „์— ๋งŒ๋“ค์–ด๋†“์€ scrapper (opens new window)๋ฅผ ์›น์‚ฌ์ดํŠธ repl์— ๋ณต๋ถ™ํ•ด๋ณด์ž. (so.py๋ฅผ ๊ฐ€์ ธ์™€ scrapper.py ํŒŒ์ผ๋กœ ๋ณต๋ถ™)

์›น์‚ฌ์ดํŠธ scrapper repl์—๋Š” requests์™€ beautifulsoup์ด ์—†์œผ๋ฏ€๋กœ ํŒจํ‚ค์ง€ install์„ ํ•ด์ฃผ์ž.

get_companies์—์„œ word๋ฅผ ๋ฐ›์„ ์ˆ˜ ์žˆ๊ฒŒ ํ•ด์ฃผ๊ณ , url๋„ ํ•ด๋‹น function์—์„œ ์ˆ˜์ •ํ•ด์„œ get_last_page๋กœ ๋„˜๊ฒจ์ค€๋‹ค. extract_companies๋„ url์„ ๋ฐ›์„ ์ˆ˜ ์žˆ๋„๋ก ๋งž์ถฐ์„œ ์ˆ˜์ •ํ•ด์ค€๋‹ค.

๐Ÿ”ธ scrapper.py

import requests
from bs4 import BeautifulSoup

# ๋งˆ์ง€๋ง‰ ํŽ˜์ด์ง€ ๊ฐ€์ ธ์˜ค๊ธฐ
def get_last_page(url):
  result = requests.get(url)
  soup = BeautifulSoup(result.text, "html.parser")
  pages = soup.find("div", {"class": "s-pagination"}).find_all("a")
  # ๋งˆ์ง€๋ง‰(-1)์€ next๋ฒ„ํŠผ์ด๋ฏ€๋กœ ๋งˆ์ง€๋ง‰์—์„œ 2๋ฒˆ์งธ๊ฑฐ(-2)๊ฐ€ last page
  # strip=True๋ฅผ ํ™œ์šฉํ•˜์—ฌ ์•ž๋’ค ๊ณต๋ฐฑ ์ž๋ฅด๊ธฐ
  last_page = pages[-2].get_text(strip=True) 
  return int(last_page)

def extract_company(html):
  content = html.find("div", {"class": "flex--item fl1 text mb0"})
  # company
  company = content.find("h2").find("a", {"class": "s-link"}).string

  location, industry = content.find_all("div", {"class": "flex--item fc-black-500 fs-body1"})
  # location
  location = location.get_text(strip=True)
  # industry
  industry = industry.get_text(strip=True)
  
  # link
  link = content.find("h2").find("a", {"class": "s-link"})['href']

  return {"company": company, "location": location, "industry": industry, "apply_link": f"https://stackoverflow.com{link}"}
  
  
# ํšŒ์‚ฌ ๊ฐ€์ ธ์˜ค๊ธฐ
def extract_companies(last_page, url):
  companies = []
  # last page์˜ ๊ฐœ์ˆ˜๋งŒํผ ๋ฐฐ์—ด ๋งŒ๋“ค์–ด์„œ for๋ฌธ ๋Œ๋ฆฌ๊ธฐ
  for page in range(last_page):
    print(f"Scrapping SO: Page: {page}")
    result = requests.get(f"{url}&pg={page + 1}")
    soup = BeautifulSoup(result.text, "html.parser")
    results = soup.find_all("div", {"class": "-company"})
    for result in results:
      company = extract_company(result)
      companies.append(company)
  return companies
  
def get_companies(word):
  url =  f"https://stackoverflow.com/jobs/companies?q={word}"
  last_page = get_last_page(url)
  companies = extract_companies(last_page, url)
  return companies

๐Ÿ”ธ main.py

from flask import Flask, render_template, request, redirect
from scrapper import get_companies

app = Flask("SuperScrapper")

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
    comps = get_companies(word)
    print(comps)
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html", searchingBy=word, color="RED")

app.run(host="0.0.0.0")

์ด์ œ ๊ฒ€์ƒ‰์„ ํ•ด์ฃผ๋ฉด ํ•ด๋‹น ๋‹จ์–ด๋กœ scrappingํ•ด์ค€ ๊ฒฐ๊ณผ๊ฐ€ print๋œ๋‹ค.

๐Ÿ”น console

Scrapping SO: Page: 0
Scrapping SO: Page: 1
Scrapping SO: Page: 2
Scrapping SO: Page: 3
Scrapping SO: Page: 4
Scrapping SO: Page: 5
Scrapping SO: Page: 6
Scrapping SO: Page: 7
Scrapping SO: Page: 8
Scrapping SO: Page: 9
Scrapping SO: Page: 10
Scrapping SO: Page: 11
Scrapping SO: Page: 12
Scrapping SO: Page: 13
Scrapping SO: Page: 14
Scrapping SO: Page: 15
Scrapping SO: Page: 16
Scrapping SO: Page: 17
Scrapping SO: Page: 18
[{'company': 'Branding Brand', 'location': 'Pittsburgh', 'industry': 'eCommerce, Headless Technology, Mobile Development', 'apply_link': 'https://stackoverflow.com/jobs/companies/branding-brand?c=1Cp4WgLdIRYzS4I8&q=react'}, {'company': 'WBS Gruppe', 'location': 'Berlin', 'industry': 'Education, eLearning, Online-Coaching', 'apply_link': 'https://stackoverflow.com/jobs/companies/wbs-gruppe?c=L5ITLrEQ7vTvO9BC&q=react'}, {'company': 'Amaris.AI', 'location': 'Singapore', 'industry': 'Artificial Intelligence, Consulting, Cybersecurity', 'apply_link': 'https://stackoverflow.com/jobs/companies/amarisai__suspended?c=NnKFnOhHU7mvFjJC&q=react'}, {'company': 'Pragmateam', 'location': 'Sydney; Porto Alegre; Gold Coast', 'industry': 'Product Development, Software Development / Engineering', 'apply_link': 'https://stackoverflow.com/jobs/companies/pragmateam?c=KEeyy1hjI6DTPRE4&q=react'}, {'company': 'Night Market', 'location': 'New York; Los Angeles; Toronto', 'industry': 'Advertising Technology, Data & Analytics, Media', 'apply_link': 'https://stackoverflow.com/jobs/companies/night-market?c=ODry4n2QaXBorfwY&q=react'}, {'company': 'EVS llc', 'location': 'Westminster; Del Mar', 'industry': 'Inventory Management Software, Supply Chain Management Software, Warehouse Management Software (WMS)', 'apply_link': 'https://stackoverflow.com/jobs/companies/evs-llc?c=OA83Of1Loao4UtDq&q=react'}, {'company': 'Hubble Pte Ltd', 'location': 'Singapore', 'industry': '3D Models, Construction, Information Technology', 'apply_link': 'https://stackoverflow.com/jobs/companies/hubble-pte-ltd?c=OEw1V06PTVhZKYcE&q=react'}, {'company': 'Paradox Cat GmbH', 'location': 'Ingolstadt; Mรผnchen', 'industry': 'Automotive, Computer Graphics, Project Management', 'apply_link': 'https://stackoverflow.com/jobs/companies/paradox-cat-ltd?c=HVGSQCvPD9snKwlG&q=react'}, {'company': 'AMBOSS ', 'location': 'Kรถln; Berlin; New York', 'industry': 'Education Technology, Healthcare, Medical', 'apply_link': 'https:/
... ์ƒ๋žต

# 4.5 Faster Scrapper

fake db ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ๊ณ , ๊ฒ€์ƒ‰๊ฒฐ๊ณผ๋ฅผ ์ €์žฅํ•œ๋‹ค.

์ด์ „์— ๊ฒ€์ƒ‰ํ•œ ๊ฒฐ๊ณผ๊ฐ€ ์žˆ์œผ๋ฉด fake db์—์„œ ์ฐพ์•„์ฃผ๊ณ , ์—†์œผ๋ฉด ์ƒˆ๋กœ ๊ฒ€์ƒ‰ํ•˜์—ฌ fake db์— ๋„ฃ์–ด์ค€๋‹ค.

๐Ÿ”ธ main.py

from flask import Flask, render_template, request, redirect
from scrapper import get_companies

app = Flask("SuperScrapper")

# fake db
db = {}

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
    # db ์— ์žˆ๋Š”์ง€ ํ™•์ธ
    fromDb = db.get(word)
    if fromDb:
      comps = fromDb
    else:
      comps = get_companies(word)
      db[word] = comps
    print(comps)
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html",
                         searchingBy=word,
                         resultsNumber=len(comps)
                        )

app.run(host="0.0.0.0")

๐Ÿ”ธ report.html

<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
    </form>
  </body>
</html>

๐Ÿ”น web

(screenshot)

# 4.6 Rendering Jobs!

flask html์—์„œ python์„ ์ž‘์„ฑํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ์•„๋ž˜์™€ ๊ฐ™์ด ํ•˜๋ฉด ๋œ๋‹ค.

css grid๋ฅผ ํ™œ์šฉํ•˜์—ฌ ๊ฐ€์ ธ์˜จ companies๋ฅผ ํ…Œ์ด๋ธ” ํ˜•์‹์œผ๋กœ ๋ฟŒ๋ ค์ฃผ์—ˆ๋‹ค.

๐Ÿ”ธ main.py

from flask import Flask, render_template, request, redirect
from scrapper import get_companies

app = Flask("SuperScrapper")

# fake db
db = {}

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
    # db ์— ์žˆ๋Š”์ง€ ํ™•์ธ
    existingComps = db.get(word)
    if existingComps:
      comps = existingComps
    else:
      comps = get_companies(word)
      db[word] = comps
    print(comps)
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html",
                         searchingBy=word,
                         resultsNumber=len(comps),
                         comps=comps
                        )

app.run(host="0.0.0.0")

๐Ÿ”ธ report.html

<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
    <style>
      section {
        display: grid;
        gap: 20px;
        grid-template-columns: repeat(4, 1fr);
      }
    </style>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
    <section>
      <h4>company</h4>
      <h4>location</h4>
      <h4>industry</h4>
      <h4>apply_link</h4>
      {% for comp in comps %}
        <span>{{comp.company}}</span>
        <span>{{comp.location}}</span>
        <span>{{comp.industry}}</span>
        <a href="{{comp.apply_link}}">Apply</a>
      {% endfor %}
    </section>
  </body>
</html>

๐Ÿ”น web

(screenshot)

# 4.7 Export Route

๋ฒ„ํŠผ์„ ๋งŒ๋“ค์–ด์„œ csv๋กœ export ํ•ด๋ณด์ž.

๐Ÿ”ธ main.py

from flask import Flask, render_template, request, redirect
from scrapper import get_companies

app = Flask("SuperScrapper")

# fake db
db = {}

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
    # db ์— ์žˆ๋Š”์ง€ ํ™•์ธ
    existingComps = db.get(word)
    if existingComps:
      comps = existingComps
    else:
      comps = get_companies(word)
      db[word] = comps
    print(comps)
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html",
                         searchingBy=word,
                         resultsNumber=len(comps),
                         comps=comps
                        )

@app.route("/export")
def export():
  # use try/except (an exception jumps straight to the except block)
  try:
    word = request.args.get('word') # ๊ฒ€์ƒ‰์–ด ์—†์œผ๋ฉด ์—๋Ÿฌ
    if not word:
      raise Exception()
    word = word.lower()
    comps = db.get(word)
    if not comps: #db์— ์—†์œผ๋ฉด ์—๋Ÿฌ
      raise Exception()
    return f"Genearte CSV for {word}"
  except:
    return redirect('/')
    
app.run(host="0.0.0.0")

๐Ÿ”ธ report.html

<!DOCTYPE html>
<html>
  <head>
    <title>Job Search</title>
    <style>
      section {
        display: grid;
        gap: 20px;
        grid-template-columns: repeat(4, 1fr);
      }
    </style>
  </head>
  <body>
    <h1>Search Results</h1>
    <h3>Found {{resultsNumber}} results for: {{searchingBy}}</h3>
    <a href="/export?word={{searchingBy}}">Export to CSV</a>
    <section>
      <h4>company</h4>
      <h4>location</h4>
      <h4>industry</h4>
      <h4>apply_link</h4>
      {% for comp in comps %}
        <span>{{comp.company}}</span>
        <span>{{comp.location}}</span>
        <span>{{comp.industry}}</span>
        <a href="{{comp.apply_link}}">Apply</a>
      {% endfor %}
    </section>
  </body>
</html>

๐Ÿ”น web

(screenshots)

# 4.8 File Download

์ด์ „ scrapper (opens new window) ์—์„œ save.py๋ฅผ ๊ฐ€์ ธ์˜จ๋‹ค.

exporter.py๋ฅผ ์ƒˆ๋กœ ๋งŒ๋“ค์–ด์„œ ๋ณต๋ถ™ํ•ด์ฃผ์ž.

csvํŒŒ์ผ์„ ๋งŒ๋“ค๊ณ  send_file์„ ์‚ฌ์šฉํ•˜์—ฌ ๋‹ค์šด๋กœ๋“œ๊นŒ์ง€ ํ•ด์ฃผ์ž.

๐Ÿ”ธ main.py

from flask import Flask, render_template, request, redirect, send_file
from scrapper import get_companies
from exporter import save_to_file_companies

app = Flask("SuperScrapper")

# fake db
db = {}

@app.route("/")
def home():
  return render_template("potato.html")

@app.route("/report")
def report():
  word = request.args.get('word')
  if word: # word๊ฐ€ ์žˆ๋Š” ๊ฒฝ์šฐ ์†Œ๋ฌธ์ž๋กœ ๋ณ€๊ฒฝ
    word = word.lower()
    # db ์— ์žˆ๋Š”์ง€ ํ™•์ธ
    existingComps = db.get(word)
    if existingComps:
      comps = existingComps
    else:
      comps = get_companies(word)
      db[word] = comps
    print(comps)
  else: # ์—†๋Š” ๊ฒฝ์šฐ ํ™ˆ์œผ๋กœ redirect
    return redirect("/")
  return render_template("report.html",
                         searchingBy=word,
                         resultsNumber=len(comps),
                         comps=comps
                        )

@app.route("/export")
def export():
  # use try/except (an exception jumps straight to the except block)
  try:
    word = request.args.get('word') # ๊ฒ€์ƒ‰์–ด ์—†์œผ๋ฉด ์—๋Ÿฌ
    if not word:
      raise Exception()
    word = word.lower()
    comps = db.get(word)
    if not comps: #db์— ์—†์œผ๋ฉด ์—๋Ÿฌ
      raise Exception()
    save_to_file_companies(comps)
    return send_file("companies.csv")
  except:
    return redirect('/')
    
app.run(host="0.0.0.0")

๐Ÿ”ธ export.py

import csv

def save_to_file_companies(companies):
  file = open("companies.csv", mode="w")
  writer = csv.writer(file)
  # ํ—ค๋”์ค„ ์ƒ์„ฑ
  writer.writerow(["company", "location", "industry", "apply_link"])
  for company in companies:
    # dict์—์„œ values๋งŒ ๊ฐ€์ ธ์˜ค๋ฉด dict_values๊ฐ€ type์ž„
    # ๋”ฐ๋ผ์„œ list๋กœ cast ํ•ด์ค€๋‹ค
    writer.writerow(list(company.values()))
  return 

๊ฒ€์ƒ‰ ํ›„, export ํ•ด์ฃผ๋ฉด ํŒŒ์ผ์ด ๋‹ค์šด๋กœ๋“œ ๋˜๋Š” ๊ฑธ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.

๐Ÿ”น csv

(screenshot)

# Reference

Pythone์œผ๋กœ ์›น ์Šคํฌ๋ž˜ํผ ๋งŒ๋“ค๊ธฐ (opens new window)
Python library (opens new window)
python-scrapper-replit (opens new window)
python-super-scrapper-replit (opens new window)
