2014-03-14 00:36:14 +07:00
from __future__ import unicode_literals
2016-03-26 23:42:34 +06:00
import re
2014-03-14 00:36:14 +07:00
from . common import InfoExtractor
2014-12-13 12:24:42 +01:00
from . . compat import (
2015-12-02 00:48:27 +06:00
compat_HTTPError ,
2018-02-26 04:12:28 +07:00
compat_kwargs ,
2016-09-22 21:48:53 +07:00
compat_str ,
2014-03-14 00:36:14 +07:00
compat_urllib_request ,
2016-03-25 02:26:46 +06:00
compat_urlparse ,
2014-12-13 12:24:42 +01:00
)
from . . utils import (
2016-03-26 23:42:34 +06:00
determine_ext ,
extract_attributes ,
2014-03-14 00:36:14 +07:00
ExtractorError ,
2015-12-02 00:48:27 +06:00
float_or_none ,
2015-12-01 20:35:46 +06:00
int_or_none ,
2017-08-04 23:44:07 +07:00
js_to_json ,
2015-11-21 22:18:17 +06:00
sanitized_Request ,
2016-01-06 00:02:21 +06:00
unescapeHTML ,
2016-03-26 02:19:24 +06:00
urlencode_postdata ,
2014-03-14 00:36:14 +07:00
)
class UdemyIE(InfoExtractor):
    """Extractor for a single Udemy lecture.

    Matches the three lecture URL layouts listed in ``_VALID_URL``, logs in
    when credentials are available, and resolves the lecture's video asset
    into formats and subtitles.
    """

    IE_NAME = 'udemy'
    _VALID_URL = r'''(?x)
                    https?://
                        www\.udemy\.com/
                        (?:
                            [^#]+\#/lecture/|
                            lecture/view/?\?lectureId=|
                            [^/]+/learn/v4/t/lecture/
                        )
                        (?P<id>\d+)
                    '''
    _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
    _ORIGIN_URL = 'https://www.udemy.com'
    _NETRC_MACHINE = 'udemy'

    _TESTS = [{
        'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757',
        'md5': '98eda5b657e752cf945d8445e261b5c5',
        'info_dict': {
            'id': '160614',
            'ext': 'mp4',
            'title': 'Introduction and Installation',
            'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876',
            'duration': 579.29,
        },
        'skip': 'Requires udemy account credentials',
    }, {
        # new URL schema
        'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906',
        'only_matching': True,
    }, {
        # no url in outputs format entry
        'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812',
        'only_matching': True,
    }, {
        # only outputs rendition
        'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
        'only_matching': True,
    }]

    def _extract_course_info(self, webpage, video_id):
        """Return ``(course_id, course_title)`` scraped from a course/lecture page.

        The course JSON embedded in an ``ng-init`` attribute is tried first;
        when it yields no id, the ``data-course-id`` attribute is used
        instead (and its absence is fatal). The title may be None.
        """
        course = self._parse_json(
            unescapeHTML(self._search_regex(
                r'ng-init=["\'].*\bcourse=({.+?})[;"\']',
                webpage, 'course', default='{}')),
            video_id, fatal=False) or {}
        course_id = course.get('id') or self._search_regex(
            r'data-course-id=["\'](\d+)', webpage, 'course id')
        return course_id, course.get('title')

    def _enroll_course(self, base_url, webpage, course_id):
        """Enroll in the course when the page offers a free subscribe link.

        Raises an expected ExtractorError when the page contains a checkout
        link instead, i.e. the course has to be paid for.
        """
        def combine_url(base_url, url):
            # Links on the page may be site-relative; absolute ones are kept.
            return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url

        checkout_url = unescapeHTML(self._search_regex(
            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',
            webpage, 'checkout url', group='url', default=None))
        if checkout_url:
            raise ExtractorError(
                'Course %s is not free. You have to pay for it before you can download. '
                'Use this URL to confirm purchase: %s'
                % (course_id, combine_url(base_url, checkout_url)),
                expected=True)

        enroll_url = unescapeHTML(self._search_regex(
            r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1',
            webpage, 'enroll url', group='url', default=None))
        if enroll_url:
            webpage = self._download_webpage(
                combine_url(base_url, enroll_url),
                course_id, 'Enrolling in the course',
                headers={'Referer': base_url})
            if '>You have enrolled in' in webpage:
                self.to_screen('%s: Successfully enrolled in the course' % course_id)

    def _download_lecture(self, course_id, lecture_id):
        """Fetch the lecture JSON (title, description, view_html, asset) from the API."""
        return self._download_json(
            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
            % (course_id, lecture_id),
            lecture_id, 'Downloading lecture JSON', query={
                'fields[lecture]': 'title,description,view_html,asset',
                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
            })

    def _handle_error(self, response):
        """Raise an expected ExtractorError when a JSON response carries an ``error`` entry.

        Non-dict responses and responses without an error are ignored.
        """
        if not isinstance(response, dict):
            return
        error = response.get('error')
        if error:
            error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message'))
            error_data = error.get('data')
            if error_data:
                error_str += ' - %s' % error_data.get('formErrors')
            raise ExtractorError(error_str, expected=True)

    def _download_webpage_handle(self, *args, **kwargs):
        """Delegate to the base implementation with a fixed Safari User-Agent header."""
        kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
        return super(UdemyIE, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))

    def _download_json(self, url_or_request, *args, **kwargs):
        """Download JSON with the Udemy API headers attached and errors checked.

        The client id and bearer token headers are taken from the
        ``client_id`` and ``access_token`` cookies when those are present.
        The parsed response is passed through ``_handle_error`` before
        being returned.
        """
        headers = {
            'X-Udemy-Snail-Case': 'true',
            'X-Requested-With': 'XMLHttpRequest',
        }
        for cookie in self._downloader.cookiejar:
            if cookie.name == 'client_id':
                headers['X-Udemy-Client-Id'] = cookie.value
            elif cookie.name == 'access_token':
                headers['X-Udemy-Bearer-Token'] = cookie.value
                headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value

        if isinstance(url_or_request, compat_urllib_request.Request):
            for header, value in headers.items():
                url_or_request.add_header(header, value)
        else:
            url_or_request = sanitized_Request(url_or_request, headers=headers)

        response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs)
        self._handle_error(response)
        return response

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in through the login popup form when credentials are configured.

        Does nothing without a username or when the popup already shows a
        logout link. Raises ExtractorError when the post-login page still
        does not look logged in.
        """
        (username, password) = self._get_login_info()
        if username is None:
            return

        login_popup = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login popup')

        def is_logged(webpage):
            return any(re.search(p, webpage) for p in (
                r'href=["\'](?:https://www\.udemy\.com)?/user/logout/',
                r'>Logout<'))

        # already logged in
        if is_logged(login_popup):
            return

        login_form = self._form_hidden_inputs('login-form', login_popup)

        login_form.update({
            'email': username,
            'password': password,
        })

        response = self._download_webpage(
            self._LOGIN_URL, None, 'Logging in',
            data=urlencode_postdata(login_form),
            headers={
                'Referer': self._ORIGIN_URL,
                'Origin': self._ORIGIN_URL,
            })

        if not is_logged(response):
            error = self._html_search_regex(
                r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>',
                response, 'error message', default=None)
            if error:
                raise ExtractorError('Unable to login: %s' % error, expected=True)
            raise ExtractorError('Unable to log in')

    def _real_extract(self, url):
        """Extract one lecture and return its info dict (or a YouTube url_result).

        Formats are gathered from the asset's ``download_urls``, from
        ``<source>`` tags and the ``videojs-setup-data`` attribute in
        ``view_html``, and finally from the asset's ``outputs`` map when
        nothing else produced a format.
        """
        lecture_id = self._match_id(url)

        webpage = self._download_webpage(url, lecture_id)

        course_id, _ = self._extract_course_info(webpage, lecture_id)

        try:
            lecture = self._download_lecture(course_id, lecture_id)
        except ExtractorError as e:
            # Error could possibly mean we are not enrolled in the course
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                self._enroll_course(url, webpage, course_id)
                lecture = self._download_lecture(course_id, lecture_id)
            else:
                raise

        title = lecture['title']
        description = lecture.get('description')

        asset = lecture['asset']

        asset_type = asset.get('asset_type') or asset.get('assetType')
        if asset_type != 'Video':
            raise ExtractorError(
                'Lecture %s is not a video' % lecture_id, expected=True)

        stream_url = asset.get('stream_url') or asset.get('streamUrl')
        if stream_url:
            youtube_url = self._search_regex(
                r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None)
            if youtube_url:
                return self.url_result(youtube_url, 'Youtube')

        video_id = compat_str(asset['id'])
        thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl')

        # 'data' may be missing or null; normalise so the lookups below are safe.
        asset_data = asset.get('data')
        if not isinstance(asset_data, dict):
            asset_data = {}
        duration = float_or_none(asset_data.get('duration'))

        subtitles = {}
        automatic_captions = {}

        formats = []

        def extract_output_format(src, f_id):
            # Map one entry of the asset's 'outputs' dict onto format fields.
            return {
                'url': src.get('url'),
                'format_id': '%sp' % (src.get('height') or f_id),
                'width': int_or_none(src.get('width')),
                'height': int_or_none(src.get('height')),
                'vbr': int_or_none(src.get('video_bitrate_in_kbps')),
                'vcodec': src.get('video_codec'),
                'fps': int_or_none(src.get('frame_rate')),
                'abr': int_or_none(src.get('audio_bitrate_in_kbps')),
                'acodec': src.get('audio_codec'),
                'asr': int_or_none(src.get('audio_sample_rate')),
                'tbr': int_or_none(src.get('total_bitrate_in_kbps')),
                'filesize': int_or_none(src.get('file_size_in_bytes')),
            }

        outputs = asset_data.get('outputs')
        if not isinstance(outputs, dict):
            outputs = {}

        def add_output_format_meta(f, key):
            # Enrich f with the matching 'outputs' entry; values already in f win.
            output = outputs.get(key)
            if isinstance(output, dict):
                output_format = extract_output_format(output, key)
                output_format.update(f)
                return output_format
            return f

        def extract_formats(source_list):
            if not isinstance(source_list, list):
                return
            for source in source_list:
                if not isinstance(source, dict):
                    continue
                video_url = source.get('file') or source.get('src')
                if not video_url or not isinstance(video_url, compat_str):
                    continue
                if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False))
                    continue
                format_id = source.get('label')
                f = {
                    'url': video_url,
                    # A missing label must not produce the bogus id 'Nonep'.
                    'format_id': '%sp' % format_id if format_id else None,
                    'height': int_or_none(format_id),
                }
                if format_id:
                    # Some videos contain additional metadata (e.g.
                    # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
                    f = add_output_format_meta(f, format_id)
                formats.append(f)

        def extract_subtitles(track_list):
            if not isinstance(track_list, list):
                return
            for track in track_list:
                if not isinstance(track, dict):
                    continue
                if track.get('kind') != 'captions':
                    continue
                src = track.get('src')
                if not src or not isinstance(src, compat_str):
                    continue
                lang = track.get('language') or track.get(
                    'srclang') or track.get('label')
                sub_dict = automatic_captions if track.get(
                    'autogenerated') is True else subtitles
                sub_dict.setdefault(lang, []).append({
                    'url': src,
                })

        download_urls = asset.get('download_urls')
        if isinstance(download_urls, dict):
            extract_formats(download_urls.get('Video'))

        view_html = lecture.get('view_html')
        if view_html:
            view_html_urls = set()
            for source in re.findall(r'<source[^>]+>', view_html):
                attributes = extract_attributes(source)
                src = attributes.get('src')
                if not src:
                    continue
                res = attributes.get('data-res')
                height = int_or_none(res)
                if src in view_html_urls:
                    continue
                view_html_urls.add(src)
                if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8':
                    m3u8_formats = self._extract_m3u8_formats(
                        src, video_id, 'mp4', entry_protocol='m3u8_native',
                        m3u8_id='hls', fatal=False)
                    for f in m3u8_formats:
                        # Fill in height/tbr from the URL path when the manifest lacks them.
                        m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url'])
                        if m:
                            if not f.get('height'):
                                f['height'] = int(m.group('height'))
                            if not f.get('tbr'):
                                f['tbr'] = int(m.group('tbr'))
                    formats.extend(m3u8_formats)
                else:
                    formats.append(add_output_format_meta({
                        'url': src,
                        'format_id': '%dp' % height if height else None,
                        'height': height,
                    }, res))

            # react rendition since 2017.04.15 (see
            # https://github.com/rg3/youtube-dl/issues/12744)
            data = self._parse_json(
                self._search_regex(
                    r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html,
                    'setup data', default='{}', group='data'), video_id,
                transform_source=unescapeHTML, fatal=False)
            if data and isinstance(data, dict):
                extract_formats(data.get('sources'))
                if not duration:
                    duration = int_or_none(data.get('duration'))
                extract_subtitles(data.get('tracks'))

            if not subtitles and not automatic_captions:
                text_tracks = self._parse_json(
                    self._search_regex(
                        r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
                        'text tracks', default='{}', group='data'), video_id,
                    transform_source=lambda s: js_to_json(unescapeHTML(s)),
                    fatal=False)
                extract_subtitles(text_tracks)

        if not formats and outputs:
            for format_id, output in outputs.items():
                if not isinstance(output, dict):
                    continue
                f = extract_output_format(output, format_id)
                if f.get('url'):
                    formats.append(f)

        self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
            'automatic_captions': automatic_captions,
        }
class UdemyCourseIE(UdemyIE):
    """Extractor for a whole Udemy course, returned as a playlist of lectures.

    Only claims URLs that ``UdemyIE`` does not already match, since its own
    ``_VALID_URL`` would otherwise also match lecture URLs.
    """

    IE_NAME = 'udemy:course'
    _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)'
    _TESTS = []

    @classmethod
    def suitable(cls, url):
        return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Enroll in the course and build a playlist from its curriculum.

        Lectures whose asset is a dict with a type other than 'Video' are
        skipped. Each remaining lecture becomes a ``url_transparent`` entry
        handled by ``UdemyIE``, tagged with the most recently seen chapter.
        """
        course_path = self._match_id(url)

        webpage = self._download_webpage(url, course_path)

        course_id, title = self._extract_course_info(webpage, course_path)

        self._enroll_course(url, webpage, course_id)

        response = self._download_json(
            'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id,
            course_id, 'Downloading course curriculum', query={
                'fields[chapter]': 'title,object_index',
                'fields[lecture]': 'title,asset',
                'page_size': '1000',
            })

        entries = []
        chapter, chapter_number = [None] * 2
        for item in response['results']:
            if not isinstance(item, dict):
                continue
            clazz = item.get('_class')
            if clazz == 'lecture':
                asset = item.get('asset')
                if isinstance(asset, dict):
                    asset_type = asset.get('asset_type') or asset.get('assetType')
                    if asset_type != 'Video':
                        continue
                lecture_id = item.get('id')
                if lecture_id:
                    # Separate name so the loop variable is not rebound mid-iteration.
                    entry = {
                        '_type': 'url_transparent',
                        'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, lecture_id),
                        'title': item.get('title'),
                        'ie_key': UdemyIE.ie_key(),
                    }
                    if chapter_number:
                        entry['chapter_number'] = chapter_number
                    if chapter:
                        entry['chapter'] = chapter
                    entries.append(entry)
            elif clazz == 'chapter':
                # Chapters precede their lectures in the results; remember the current one.
                chapter_number = item.get('object_index')
                chapter = item.get('title')

        return self.playlist_result(entries, course_id, title)