
[Python, web scraper] Data collection practice

by 그렉그의 2023. 3. 3.

MAIN PAGE
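main.py pulls the two extractor functions in from an extractors package. The post doesn't show the folder structure, but the imports imply a layout roughly like this (assumed for illustration, not shown in the original):

main.py
extractors/
  __init__.py
  jobkr.py   # defines extract_jobkr_jobs
  wwr.py     # defines extract_wwr_jobs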

from extractors.jobkr import extract_jobkr_jobs
from extractors.wwr import extract_wwr_jobs

keyword = input("what job are you searching for? ")

# both extractors return a list, so the two results can be combined with +
jobkr = extract_jobkr_jobs(keyword)
wwr = extract_wwr_jobs(keyword)
jobs = jobkr + wwr

# open the csv file for writing
file = open(f"{keyword}.csv", "w", encoding="utf-8")
# the values are comma separated, and each row has to end with \n
file.write("Company,Position,Location,Link\n")

# because we are combining two lists (jobkr + wwr, both are lists), we loop over the combined result
# remember that both extractors store the scraped content in dictionaries with the same keys (job_data)
for job in jobs:
  # each job (job_data) is a dictionary, so its values are read out by key
  file.write(
    f"{job['company']},{job['position']},{job['location']},{job['link']}\n")

file.close()
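The extractors strip commas out of every field so they don't break the hand-written CSV rows. The standard csv module handles quoting for you, so a slightly more robust version of the same write loop could look like this (a sketch, assuming the same keyword and jobs variables as above):

import csv

with open(f"{keyword}.csv", "w", encoding="utf-8", newline="") as file:
  writer = csv.writer(file)
  writer.writerow(["Company", "Position", "Location", "Link"])
  for job in jobs:
    writer.writerow([job["company"], job["position"], job["location"], job["link"]])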

 

JOBKR PAGE

from requests import get
from bs4 import BeautifulSoup

# from extractors.wwr import extract_wwr_jobs
# imported from another folder (the extractors package)
# from extractors.jobkr import extract_jobkr_jobs


# need to request the first page
def get_page_count(keyword):

  # request the first page of the search
  base_url = "https://www.jobkorea.co.kr/Search/?stext="
  response = get(f"{base_url}+{keyword}")
  if response.status_code != 200:
    print("cant request")
    # create the soup of the first page
  else:
    soup = BeautifulSoup(response.text, "html.parser")
    # this time the soup is used to find the page numbers instead of the jobs
    # use find for a single (first) match; use find_all for every recursive match
    pagination_div = soup.find("div", class_="tplPagination")
    # the pagination block may be missing entirely, e.g. when there is only one page
    if pagination_div is None:
      return 1
    pagination = pagination_div.find("ul")
    if pagination is None:
      return 1
    pages = pagination.find_all("li")
    count = len(pages)
    # if there are more than 10 pages, only return 10
    if count >= 10:
      return 10
    else:
      return count


def extract_jobkr_jobs(keyword):
  # find out how many pages there are to scrape
  # range creates a sequence of numbers from 0 up to (but not including) pages
  pages = get_page_count(keyword)
  print("found", pages, "pages")
  # results has to live outside the for loop so it accumulates across pages
  results = []

  for page in range(pages):
    # the first request searches the first page of results
    # after that, the same request is repeated for every following page
    base_url = "https://www.jobkorea.co.kr/Search/?stext="
    final_url = f"{base_url}{keyword}&tabType=recruit&Page_No={page+1}"
    print(final_url)
    response = get(final_url)

    if response.status_code != 200:
      print("cant request")
    else:
      soup = BeautifulSoup(response.text, "html.parser")
      job_list = soup.find("div", class_="list-default")
      jobs = job_list.find("ul", class_="clear")
      # find_all collects every matching tag recursively
      job = jobs.find_all("li", class_="list-post")
      for job_section in job:
        post = job_section.find_all("div", class_="post")

        for post_section in post:
          post_list = post_section.find("div", class_="post-list-corp")

          # company name
          company_name = post_list.find("a")
          company = company_name["title"]
          # link
          link = company_name["href"]
          # company position
          post_list_info = post_section.find("div", class_="post-list-info")
          title_name = post_list_info.find("a")
          title = title_name["title"]
          # location
          location_name = post_list_info.find("p", class_="option")
          location = location_name.find("span", class_="loc long")
          # position
          # position = location_name.find("span", class_="exp")
          job_data = {
            "company": company.replace(",", " "),
            "position": title.replace(",", " "),
            "location": location.string.replace(",", " "),
            "link": f"https://www.jobkorea.co.kr/Search/?stext=+{link}",
            # "title": title.string
          }

          results.append(job_data)
          # getting value out of the function
  # should always return the results
  return results
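The comments above lean on the difference between find and find_all. A tiny standalone illustration (the HTML snippet here is made up just for this example):

from bs4 import BeautifulSoup

html = "<ul><li>first</li><li>second</li></ul>"
soup = BeautifulSoup(html, "html.parser")

print(soup.find("li").string)    # find returns only the first match: "first"
print(len(soup.find_all("li")))  # find_all returns a list of every match: 2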

 

WWR PAGE

from requests import get
from bs4 import BeautifulSoup


def extract_wwr_jobs(keyword):
  base_url = "https://weworkremotely.com/remote-jobs/search?utf8=%E2%9C%93&term=python"
  response = get(f"{base_url}+{keyword}")

  if response.status_code != 200:
    print("cant request")
  else:
    results = []
    # pull the full html of the selected page into a soup
    soup = BeautifulSoup(response.text, "html.parser")
    # from the full html, keep only the job <section> blocks
    jobs = soup.find_all("section", class_="jobs")
    # now start narrowing things down inside each section
    for job_section in jobs:
      # pull only the <li> tags out of the section
      job_post = job_section.find_all("li")
      # the last <li> isn't a job posting, so drop it
      job_post.pop(-1)
      # now pull the anchors out of each remaining <li>
      for post in job_post:
        anchors = post.find_all("a")
        anchor = anchors[1]
        link = anchor["href"]
        company, position, location = anchor.find_all("span", class_="company")
        # name each value and grab only that tag
        title = anchor.find("span", class_="title")
        # make a dictionary based on what we have created
        # `find_all` returns a list, lists don't have .string.
        job_data = {
          "company": company.string.replace(",", " "),
          # "position": position.string.replace(",", " "),
          "position": title.string.replace(",", " "),
          "location": location.string.replace(",", " "),
          "link": f"https://weworkremotely.com+{link}"
        }
        results.append(job_data)

  return results
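The line company, position, location = anchor.find_all("span", class_="company") only works because each job card happens to contain exactly three <span class="company"> tags. A small self-contained sketch of that unpacking pattern, with a length check added and a made-up HTML snippet:

from bs4 import BeautifulSoup

html = (
  "<a href='/remote-jobs/example'>"
  "<span class='company'>Acme</span>"
  "<span class='company'>Full-Time</span>"
  "<span class='company'>Anywhere</span>"
  "</a>"
)
anchor = BeautifulSoup(html, "html.parser").find("a")

spans = anchor.find_all("span", class_="company")
# unpacking only works when the number of spans matches exactly
if len(spans) == 3:
  company, position, location = spans
  print(company.string, position.string, location.string)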
