본문으로 바로가기

[PHP] CURL을 이용한 RSS 파서

category 프로그램/PHP 2015/09/13 05:33

[PHP] CURL을 이용한 RSS 파서

RSS 2.0, RSS 1.0, ATOM에 모두 대응하기 위해서 한 개의 파일로 만들려고 노력하였지만 잘 되지 않았습니다.



<?php
/**
 * Fetches an RSS feed over cURL and normalises it into one structure.
 *
 * Handles RSS 2.0 (items nested in <channel>) and RSS 1.0 (items as siblings
 * of <channel>, author/date carried in the Dublin Core "dc" namespace).
 * ATOM documents are not handled: RssGet() returns null for them because they
 * have no <channel><title>.
 */
class RssParser
{
    /** User-Agent string sent with every request. */
    public $Agent = "MyAgent";
    /** Cookie file path. No method of this class reads it. */
    public $CookieNM = "./cookie.txt";
    /** When truthy, RssGetAccess() echoes the raw body and the HTTP status. */
    public $debug = false;
    /**
     * TLS peer verification for RssGetAccess(). Defaults to false to keep the
     * historical behaviour; set to true to reject invalid certificates.
     */
    public $VerifyPeer = false;

    /** URL requested by RssGet() / stored by RssPost(). */
    public $URL = '';
    /** Extra parameter stored by RssGet(); not used for the request itself. */
    public $Param = '';
    /** Payload stored by RssPost(). */
    public $Data = '';
    /** Byte length of $Data, computed by RssPost(). */
    public $ContentLength = 0;
    /** HTTP request headers built by GetHeader(). */
    public $Headers = array();
    /** Response body of the last RssGetAccess() call, or false on failure. */
    public $Result = false;
    /** HTTP status code of the last RssGetAccess() call. */
    public $Status = 0;
    /** cURL error message of the last RssGetAccess() call ('' on success). */
    public $Error = '';
    /** Parsed document (SimpleXMLElement) or false when nothing was parsed. */
    public $xml = false;


    function __construct()
    {
    }


    /**
     * Downloads and parses a feed.
     *
     * @param string $RssURL   Feed URL.
     * @param string $RssParam Stored in $this->Param only.
     * @return stdClass|null Normalised feed (see MyParser()), or null when
     *                       the download/parse failed or the document has no
     *                       channel title.
     */
    function RssGet($RssURL, $RssParam = '')
    {
        $this->URL = $RssURL;
        $this->Param = $RssParam;

        $this->GetHeader();
        $this->RssGetAccess();
        $this->RssXMLLoad();

        // Only return a result when the document actually carries data.
        if ($this->xml instanceof SimpleXMLElement && empty($this->xml->channel->title) == false) {
            return $this->MyParser();
        }

        return null;
    }


    /**
     * Stores the target URL and payload for a POST. No request is sent here.
     *
     * @param string $RssURL  Target URL.
     * @param string $RssData Request body.
     */
    function RssPost($RssURL, $RssData)
    {
        $this->URL = $RssURL;
        $this->Data = $RssData;
        $this->ContentLength = strlen($this->Data);
    }


    /**
     * Builds the HTTP request headers into $this->Headers.
     */
    function GetHeader()
    {
        $this->Headers = array(
            "Content-type: application/xml;charset=\"utf-8\"",
            "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language: ko-kr,ko;q=0.8,en-us;q=0.5,en;q=0.3",
            "Accept-Encoding: ",
            "Accept-Charset: EUC-KR,utf-8;q=0.7,*;q=0.7",
            "Keep-Alive: 300",
            "Connection: keep-alive"
        );
    }


    /**
     * Creates a cURL handle with the options shared by every request.
     *
     * @param string $Url     URL to request.
     * @param string $Referer Referer header value.
     * @return resource|CurlHandle
     */
    private function InitCurl($Url, $Referer)
    {
        if (empty($this->Headers) == true) {
            $this->GetHeader();
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $Url);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $this->Headers);
        curl_setopt($ch, CURLOPT_REFERER, $Referer);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->Agent);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);          // seconds
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it

        return $ch;
    }


    /**
     * Performs the actual download of $this->URL.
     *
     * Fills $this->Result (body or false), $this->Status (HTTP code) and
     * $this->Error (cURL error text).
     */
    function RssGetAccess()
    {
        $ch = $this->InitCurl($this->URL, 'http://www.naver.com');
        curl_setopt($ch, CURLOPT_HEADER, false);    // do not include response headers in the body
        // NOTE(review): verification is off by default for backward
        // compatibility; this allows man-in-the-middle tampering of feeds.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, (bool) $this->VerifyPeer);
        // The TLS version is negotiated by libcurl. Forcing SSLv3 (as before)
        // makes every HTTPS request fail on current TLS stacks.
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        $this->Result = curl_exec($ch);
        $this->Status = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $this->Error = ($this->Result === false) ? curl_error($ch) : '';

        if (empty($this->debug) == false) {
            echo chr(10)."Result : ".$this->Result."\r\n" . chr(10)."\r\n" . chr(10);
            echo chr(10)."Status : ".chr(10).$this->Status . chr(10) ."\r\n" . chr(10);
        }

        curl_close($ch);
    }


    /**
     * Returns the HTTP status code of a feed URL, so that feeds which were
     * removed can be dropped from a collection.
     *
     * Redirects are not followed, so a 3xx code is reported as-is.
     *
     * @param string $RssURL Feed URL to probe.
     * @return int HTTP status code (0 when the request failed).
     */
    function RssStatus($RssURL)
    {
        $this->GetHeader();

        // The original passed an undefined variable here, so the given URL
        // was never requested.
        $ch = $this->InitCurl($RssURL, 'http://www.google.com');

        curl_exec($ch);
        $HTTP_STATUS = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        return $HTTP_STATUS;
    }


    /**
     * Parses $this->Result with SimpleXML into $this->xml.
     *
     * $this->xml is reset to false first so that a failed or empty download
     * can never leave the previous feed's document in place.
     */
    function RssXMLLoad()
    {
        $this->xml = false;

        // Nothing to parse when cURL returned no data.
        if (empty($this->Result) == false) {
            $Previous = libxml_use_internal_errors(true);
            $this->xml = simplexml_load_string(
                $this->Result,
                'SimpleXMLElement',
                LIBXML_NOCDATA | LIBXML_NONET
            );
            libxml_clear_errors();
            libxml_use_internal_errors($Previous);
        }
    }


    /**
     * Normalises the parsed document so that every feed flavour yields the
     * same shape.
     *
     * @return stdClass Object with:
     *   - channel: title, link, author, description, pubDate (timestamp), language
     *   - image:   url, description
     *   - item:    list of arrays keyed by element name (see ParseItem())
     */
    function MyParser()
    {
        $Rss = new stdClass();
        $Rss->channel = new stdClass();
        $Rss->image = new stdClass();

        $Rss->channel->title = $this->xml->channel->title;
        $Rss->channel->link = $this->xml->channel->link;
        $Rss->channel->author = $this->OwnerWriteInfo();
        $Rss->channel->description = $this->xml->channel->description;
        $Rss->channel->pubDate = $this->OwnerDateInfo();
        $Rss->channel->language = $this->xml->channel->language;

        $Rss->image->url = $this->xml->channel->image->url;
        $Rss->image->description = $this->xml->channel->image->description;

        // RSS 2.0 nests items in <channel>; RSS 1.0 puts them next to it.
        $Items = $this->xml->channel->item;
        if (empty($Items) == true) {
            $Items = $this->xml->item;
        }

        // Always an array, even for a feed without items.
        $Rows = array();
        foreach ($Items as $ItemData) {
            $Rows[] = $this->ParseItem($ItemData);
        }
        $Rss->item = $Rows;

        return $Rss;
    }


    /**
     * Converts one <item> into an array keyed by child element name.
     *
     * "category" elements are merged into one comma separated value,
     * "pubDate" becomes a Unix timestamp, everything else is kept as the
     * SimpleXMLElement. Dublin Core dc:creator / dc:date fill in "author"
     * and a missing "pubDate".
     *
     * @param SimpleXMLElement $ItemData
     * @return array
     */
    private function ParseItem($ItemData)
    {
        $Row = array();

        foreach ($ItemData as $ItemKey => $ItemValue) {
            if (array_key_exists($ItemKey, $Row) == true) {
                // Repeated element name; the whole group was handled already.
                continue;
            }

            switch ($ItemKey) {
                case "category":
                    $Row[$ItemKey] = $this->MergeCategories($ItemData->{$ItemKey});
                    break;

                case "pubDate":
                    $Row[$ItemKey] = strtotime((string) $ItemData->{$ItemKey});
                    break;

                default:
                    $Row[$ItemKey] = $ItemData->{$ItemKey};
                    break;
            }
        }

        // RSS 1.0: author and date live in the "dc" namespace, which the
        // loop above does not iterate.
        $dc = $ItemData->children('dc', true);
        if (isset($dc->creator) == true) {
            $Row['author'] = $dc->creator;
        }
        if (empty($Row['pubDate']) == true && isset($dc->date) == true) {
            $Row['pubDate'] = strtotime((string) $dc->date);
        }

        return $Row;
    }


    /**
     * Joins all category elements with ", ".
     *
     * A single category is returned unchanged (as SimpleXMLElement); two or
     * more become a plain string.
     *
     * @param SimpleXMLElement $Categories
     * @return SimpleXMLElement|string|null
     */
    private function MergeCategories($Categories)
    {
        $Merged = null;

        foreach ($Categories as $CateData) {
            if ($Merged === null) {
                $Merged = $CateData;
            } else {
                $Merged .= ", " . $CateData;
            }
        }

        return $Merged;
    }


    /**
     * Returns the first item of the document, wherever the flavour puts it.
     *
     * @return SimpleXMLElement|null
     */
    private function FirstItem()
    {
        if (($this->xml instanceof SimpleXMLElement) == false) {
            return null;
        }

        if (isset($this->xml->channel->item[0]) == true) {
            return $this->xml->channel->item[0];    // RSS 2.0
        }

        if (isset($this->xml->item[0]) == true) {
            return $this->xml->item[0];             // RSS 1.0
        }

        return null;
    }


    /**
     * Author of the feed.
     *
     * Tried in order: first item's <author>, channel <managingEditor>,
     * first item's dc:creator, channel dc:creator.
     *
     * @return SimpleXMLElement|null
     */
    function OwnerWriteInfo()
    {
        $First = $this->FirstItem();

        if ($First !== null && empty($First->author) == false) {
            return $First->author;
        }

        if (empty($this->xml->channel->managingEditor) == false) {
            return $this->xml->channel->managingEditor;
        }

        // RSS 1.0
        if ($First !== null) {
            $dc = $First->children('dc', true);
            if (isset($dc->creator) == true) {
                return $dc->creator;
            }
        }

        $ChannelDc = $this->xml->channel->children('dc', true);
        if (isset($ChannelDc->creator) == true) {
            return $ChannelDc->creator;
        }

        return null;
    }


    /**
     * Publication date of the feed as a Unix timestamp.
     *
     * Tried in order: first item's <pubDate> (date of the latest post),
     * channel <pubDate>, channel <lastBuildDate>, first item's dc:date,
     * channel dc:date.
     *
     * @return int|false Timestamp, or false when no parsable date exists.
     */
    function OwnerDateInfo()
    {
        $DataInfo = null;
        $First = $this->FirstItem();

        if ($First !== null) {
            $DataInfo = $First->pubDate;
        }

        if (empty($DataInfo) == true) {
            $DataInfo = $this->xml->channel->pubDate;
        }

        if (empty($DataInfo) == true) {
            $DataInfo = $this->xml->channel->lastBuildDate;
        }

        // RSS 1.0
        if (empty($DataInfo) == true && $First !== null) {
            $dc = $First->children('dc', true);
            $DataInfo = $dc->date;
        }

        if (empty($DataInfo) == true) {
            $ChannelDc = $this->xml->channel->children('dc', true);
            $DataInfo = $ChannelDc->date;
        }

        if (empty($DataInfo) == true) {
            return false;
        }

        return strtotime((string) $DataInfo);
    }


    /**
     * Extracts the host part of a URL (currently unused by this class).
     *
     * An optional "scheme://" prefix is dropped and everything up to the
     * first "/", "?" or "#" is returned.
     *
     * @param string $AllUrl
     * @return string Host part, or '' when nothing matched.
     */
    function DetectDomain($AllUrl)
    {
        $Pattern = "/^(?:[a-z][a-z0-9+.\-]*:\/\/)?([^\/?#]*)/i";

        if (preg_match($Pattern, $AllUrl, $result) !== 1) {
            return '';
        }

        return isset($result[1]) ? $result[1] : '';
    }
}
?>



크리에이티브 커먼즈 라이센스
Creative Commons License

트랙백

http://trudy.kr/trackback/3

댓글을 달아 주세요

Trudy
Trudy 님의 블로그
MENU
VISITOR 오늘330 / 전체442792